In [1]:
import pandas as pd

In [2]:
# Read in and examine the imdb reviews dataset
imdb = pd.read_csv('CSVs/IMDB Dataset.csv')
imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
imdb['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [4]:
# Change sentiment to match format of other two datasets--2 for positive, 1 for negative
imdb.sentiment = imdb.sentiment.apply(lambda x: 2 if x=='positive' else 1)

In [5]:
# Put the columns in order
imdb = imdb[['sentiment','review']]
imdb.head()

Unnamed: 0,sentiment,review
0,2,One of the other reviewers has mentioned that ...
1,2,A wonderful little production. <br /><br />The...
2,2,I thought this was a wonderful way to spend ti...
3,1,Basically there's a family where a little boy ...
4,2,"Petter Mattei's ""Love in the Time of Money"" is..."


In [6]:
# Read in and examine the yelp reviews dataset
yelp_train = pd.read_csv('CSVs/yelp_train.csv',header=None)
yelp_train.head()

Unnamed: 0,0,1
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [7]:
yelp_train.columns = ['sentiment','review']
yelp_train.head()

Unnamed: 0,sentiment,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [16]:
yelp_test = pd.read_csv('CSVs/yelp_test.csv',header=None)
yelp_test.head()

Unnamed: 0,0,1
0,2,"Contrary to other reviews, I have zero complai..."
1,1,Last summer I had an appointment to get new ti...
2,2,"Friendly staff, same starbucks fair you get an..."
3,1,The food is good. Unfortunately the service is...
4,2,Even when we didn't have a car Filene's Baseme...


In [17]:
yelp_test.columns  = ['sentiment','review']
yelp_test.head()

Unnamed: 0,sentiment,review
0,2,"Contrary to other reviews, I have zero complai..."
1,1,Last summer I had an appointment to get new ti...
2,2,"Friendly staff, same starbucks fair you get an..."
3,1,The food is good. Unfortunately the service is...
4,2,Even when we didn't have a car Filene's Baseme...


In [8]:
def preprocess_amazon(input_file, output_file, encoding):
    '''Amazon files are in the format __label__[1,2] [text].
    Create a processed file in the format [1,2]\t[text]'''
    with open(input_file, 'r',encoding=file_encoding) as infile, open(output_file, 'w',encoding=file_encoding) as outfile:
        i = 0
        for line in infile: 
            processed_line = line.strip('__label__')
            processed_line = processed_line[0] + '\t' + processed_line[2:]
            outfile.write(processed_line + '\n')


In [9]:
test_input_file = 'CSVs/amazon_test.ft.txt'
test_output_file = 'CSVs/amazon_test_processed.txt'
train_input_file = 'CSVs/amazon_train.ft.txt'
train_output_file = 'CSVs/amazon_train_processed.txt'
file_encoding = 'utf-8'

In [10]:
# Create new, tab-delimited files
preprocess_amazon(test_input_file,test_output_file,file_encoding)

In [11]:
preprocess_amazon(train_input_file,train_output_file,file_encoding)

In [12]:
amazon_test = pd.read_csv('CSVs/amazon_test_processed.txt',sep='\t',header=None,names=['sentiment','review'])
amazon_test.head()

Unnamed: 0,sentiment,review
0,2,Great CD: My lovely Pat has one of the GREAT v...
1,2,One of the best game music soundtracks - for a...
2,1,Batteries died within a year ...: I bought thi...
3,2,"works fine, but Maha Energy is better: Check o..."
4,2,Great for the non-audiophile: Reviewed quite a...


In [13]:
amazon_train = pd.read_csv('CSVs/amazon_train_processed.txt',sep='\t',header=None,names=['sentiment','review'])
amazon_train.head()

Unnamed: 0,sentiment,review
0,2,Stuning even for the non-gamer: This sound tra...
1,2,The best soundtrack ever to anything.: I'm rea...
2,2,Amazing!: This soundtrack is my favorite music...
3,2,Excellent Soundtrack: I truly like this soundt...
4,2,"Remember, Pull Your Jaw Off The Floor After He..."


In [30]:
# Recombine data that was split in original dataset
yelp = pd.concat([yelp_train,yelp_test],ignore_index=True)
len(yelp)

598000

In [32]:
amazon = pd.concat([amazon_train,amazon_test],ignore_index=True)
len(amazon)

3999612

In [24]:
len(imdb)

50000

In [33]:
# Label data sources for stratification
imdb['source'] = 'imdb'
yelp['source'] = 'yelp'
amazon['source'] = 'amazon'

In [35]:
reviews = pd.concat([imdb, yelp, amazon],ignore_index=True)

In [36]:
reviews['sentiment'].unique()

array([2, 1], dtype=int64)

In [37]:
from sklearn.model_selection import train_test_split

In [40]:
# Split combined data into train/test, stratified by data source
X_train, X_test, y_train, y_test = train_test_split(reviews['review'],reviews['sentiment'],test_size=0.1,stratify=reviews['source'],random_state=42)

In [45]:
print(len(y_train),len(y_test))

4182850 464762


In [48]:
y_train.head()

2522958    1
1160125    2
861121     1
300957     1
1610389    2
Name: sentiment, dtype: int64

In [49]:
X_train.head()

2522958    Doggy Hoots Cracklers: Not as enthusiastic as ...
1160125    great product: Perfect product for freezing mi...
861121     Awesome Trimmer, Lousy Power: It pains me to g...
300957     Oh, T.I.  You should be ASHAMED to even call t...
1610389    This is one of my favorite movies: Forest Gump...
Name: review, dtype: object

In [50]:
# Save split data for later use
X_train.to_csv('CSVs/X_train_full.csv')
X_test.to_csv('CSVs/X_test_full.csv')
y_train.to_csv('CSVs/y_train_full.csv')
y_test.to_csv('CSVs/y_test_full.csv')