## import libraries


In [8]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tengyue\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## read the raw data

In [9]:
df = pd.read_csv('data/reviews.csv')
df['context'] = df.apply(lambda x : str(x['review_title']) + str(x['review_body']),axis=1)
df

Unnamed: 0,review_id,product_id,review_title,review_body,sentiment,context
0,3227267,B003EYVXV4,"""The Hunger Games"" is a Well-Constructed ""Chim...",***This review may contain spoilers***So what ...,Positive,"""The Hunger Games"" is a Well-Constructed ""Chim..."
1,801848,B007SVLWII,GREAT!!!!!,this game is the best game I have ever played ...,Positive,GREAT!!!!!this game is the best game I have ev...
2,3695888,B003ZYF1NE,Satisfied Customers,"It's comfortable, it's light and it's machine ...",Positive,"Satisfied CustomersIt's comfortable, it's ligh..."
3,1357666,B0007SL1ZI,The Greatest!!!!,THIS GAME IS REALLY GREAT YOU SHOULD BUY IT......,Positive,The Greatest!!!!THIS GAME IS REALLY GREAT YOU ...
4,1968126,0788816454,Love the movie!,"Great Adam Sandler movie, a classic! And on Bl...",Positive,"Love the movie!Great Adam Sandler movie, a cla..."
...,...,...,...,...,...,...
49995,1132230,B00CWY76CC,Really don't care for this game.,I have tried this game out for a couple of day...,Negative,Really don't care for this game.I have tried t...
49996,2939710,B000N4SHOE,I loved it!,I enjoyed watching this movie very much. It w...,Positive,I loved it!I enjoyed watching this movie very ...
49997,2468259,B00005IC0E,Very Good!,"I'm from the United Kingdom, and the title her...",Positive,"Very Good!I'm from the United Kingdom, and the..."
49998,1102407,B00CASLGJY,bad,this game is just over all bad. I couldn't bel...,Negative,badthis game is just over all bad. I couldn't ...


## preprocess

In [10]:
positive = df[df['sentiment'] == 'Positive']
negative = df[df['sentiment'] == 'Negative']

def lemmatize_stemming(text):
    stemmer = SnowballStemmer("english")
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 2:
            result.append(lemmatize_stemming(token))
    return result

# Preprocessing the text, including restoring part of speech and removing stop words
df['context'] = df['context'].apply(lambda x : ' '.join(preprocess(x)))
df = df[['context', 'sentiment']]
df['sentiment'] = df['sentiment'].apply(lambda x : 1 if x == 'Positive' else -1)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,context,sentiment
0,hunger game construct chimera movi larg suppor...,1
1,great game best game play kindl awesom new gam...,1
2,satisfi customersit comfort light machin washa...,1
3,greatest game great buy play anakin obiwan yod...,1
4,love movi great adam sandler movi classic blu ...,1
...,...,...
49995,care game tri game coupl day past certain poin...,-1
49996,love enjoy watch movi engag keep attent begin ...,1
49997,good unit kingdom titl movi contagion attract ...,1
49998,badthi game bad couldn believ cheap graphic po...,-1


## random split train and test data

In [11]:
index = df.index
df['random_number'] = np.random.randn(len(index))
train = df[df['random_number'] <= 0.8]
test = df[df['random_number'] > 0.8]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


## run the model

In [12]:
# count vectorizer:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train['context'])
test_matrix = vectorizer.transform(test['context'])

# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

X_train = train_matrix
X_test = test_matrix
y_train = train['sentiment']
y_test = test['sentiment']
lr.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

## prediction and test

In [13]:
predictions = lr.predict(X_test)

# find accuracy, precision, recall:
from sklearn.metrics import confusion_matrix,classification_report
new = np.asarray(y_test)
confusion_matrix(predictions,y_test)
print(classification_report(predictions,y_test))

              precision    recall  f1-score   support

          -1       0.57      0.73      0.64      1171
           1       0.96      0.93      0.95      9305

    accuracy                           0.91     10476
   macro avg       0.77      0.83      0.79     10476
weighted avg       0.92      0.91      0.91     10476



## try to fit the model under different type

In [5]:
df = pd.read_csv('data/reviews.csv')
df['context'] = df.apply(lambda x : str(x['review_title']) + str(x['review_body']),axis=1)
prediction_df = pd.read_csv('task1a.csv')
merge_data = pd.merge(df, prediction_df)
merge_data

Unnamed: 0,review_id,product_id,review_title,review_body,sentiment,context,product_category
0,3227267,B003EYVXV4,"""The Hunger Games"" is a Well-Constructed ""Chim...",***This review may contain spoilers***So what ...,Positive,"""The Hunger Games"" is a Well-Constructed ""Chim...",5
1,801848,B007SVLWII,GREAT!!!!!,this game is the best game I have ever played ...,Positive,GREAT!!!!!this game is the best game I have ev...,0
2,3695888,B003ZYF1NE,Satisfied Customers,"It's comfortable, it's light and it's machine ...",Positive,"Satisfied CustomersIt's comfortable, it's ligh...",4
3,1357666,B0007SL1ZI,The Greatest!!!!,THIS GAME IS REALLY GREAT YOU SHOULD BUY IT......,Positive,The Greatest!!!!THIS GAME IS REALLY GREAT YOU ...,0
4,1968126,0788816454,Love the movie!,"Great Adam Sandler movie, a classic! And on Bl...",Positive,"Love the movie!Great Adam Sandler movie, a cla...",5
...,...,...,...,...,...,...,...
49995,1132230,B00CWY76CC,Really don't care for this game.,I have tried this game out for a couple of day...,Negative,Really don't care for this game.I have tried t...,0
49996,2939710,B000N4SHOE,I loved it!,I enjoyed watching this movie very much. It w...,Positive,I loved it!I enjoyed watching this movie very ...,5
49997,2468259,B00005IC0E,Very Good!,"I'm from the United Kingdom, and the title her...",Positive,"Very Good!I'm from the United Kingdom, and the...",5
49998,1102407,B00CASLGJY,bad,this game is just over all bad. I couldn't bel...,Negative,badthis game is just over all bad. I couldn't ...,0


## preprocess and train

In [7]:
# save the performance under the each type
performance_score = []

# process and train the model under different types
for i in range(0, 6):
    type_df = merge_data[merge_data['product_category'] == i]
    positive = type_df[type_df['sentiment'] == 'Positive']
    negative = type_df[type_df['sentiment'] == 'Negative']

    # Preprocessing the text, including restoring part of speech and removing stop words
    type_df['context'] = type_df['context'].apply(lambda x : ' '.join(preprocess(x)))
    simple_df = type_df[['context', 'sentiment']]
    simple_df['sentiment'] = simple_df['sentiment'].apply(lambda x : 1 if x == 'Positive' else -1)

    # random split train and test data
    index = simple_df.index
    simple_df['random_number'] = np.random.randn(len(index))
    train = simple_df[simple_df['random_number'] <= 0.8]
    test = simple_df[simple_df['random_number'] > 0.8]

    # run the mode
    # count vectorizer:
    vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
    train_matrix = vectorizer.fit_transform(train['context'])
    test_matrix = vectorizer.transform(test['context'])

    # Logistic Regression

    lr = LogisticRegression()

    X_train = train_matrix
    X_test = test_matrix
    y_train = train['sentiment']
    y_test = test['sentiment']
    lr.fit(X_train,y_train)

    # prediction and test
    predictions = lr.predict(X_test)

    # find accuracy, precision, recall:
    new = np.asarray(y_test)
    confusion_matrix(predictions,y_test)
    print(classification_report(predictions,y_test))
    performance_score.append(classification_report(predictions,y_test))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://s

              precision    recall  f1-score   support

          -1       0.64      0.78      0.71       424
           1       0.97      0.94      0.95      2884

    accuracy                           0.92      3308
   macro avg       0.81      0.86      0.83      3308
weighted avg       0.93      0.92      0.92      3308



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


              precision    recall  f1-score   support

          -1       0.39      0.54      0.45        74
           1       0.94      0.90      0.92       632

    accuracy                           0.86       706
   macro avg       0.67      0.72      0.69       706
weighted avg       0.89      0.86      0.87       706



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


              precision    recall  f1-score   support

          -1       0.38      0.56      0.45        34
           1       0.95      0.90      0.92       296

    accuracy                           0.86       330
   macro avg       0.66      0.73      0.69       330
weighted avg       0.89      0.86      0.87       330



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://s

              precision    recall  f1-score   support

          -1       0.43      0.61      0.50       174
           1       0.96      0.92      0.94      1708

    accuracy                           0.89      1882
   macro avg       0.69      0.76      0.72      1882
weighted avg       0.91      0.89      0.90      1882



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


              precision    recall  f1-score   support

          -1       0.41      0.62      0.49       112
           1       0.97      0.93      0.95      1522

    accuracy                           0.91      1634
   macro avg       0.69      0.77      0.72      1634
weighted avg       0.93      0.91      0.92      1634



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


              precision    recall  f1-score   support

          -1       0.56      0.75      0.64       273
           1       0.97      0.93      0.95      2343

    accuracy                           0.91      2616
   macro avg       0.77      0.84      0.80      2616
weighted avg       0.93      0.91      0.92      2616



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## predict the unlabeled data

In [15]:
read_test = pd.read_csv('data/no_ratings.csv')
read_test['context'] = read_test.apply(lambda x : str(x['review_title']) + str(x['review_body']),axis=1)
# Preprocessing the text, including restoring part of speech and removing stop words
read_test['context'] = read_test['context'].apply(lambda x : ' '.join(preprocess(x)))
read_test = read_test[['review_id', 'context']]
read_test

Unnamed: 0,review_id,context
0,1577265,tast like chees good altern vegan long time or...
1,774185,great list app list app develop use leav revie...
2,3722095,dog wild themth dog love actual start bark din...
3,359962,brokethi million vibrat buy year break fair qu...
4,1735617,listen bad reviewsi amazon local box star pos ...
...,...,...
5495,1660361,greati chocolatti wasn abl tast peppermint order
5496,1404529,amaz newth game far east agre hideo kojima say...
5497,1343547,new like good game hard use compon cabl xbox r...
5498,1708593,short life expensivei lot dewalt batteri nicd ...


In [19]:
test_matrix = vectorizer.transform(read_test['context'])
predictions = lr.predict(test_matrix)
predictions

array([ 1,  1,  1, ..., -1,  1,  1], dtype=int64)

In [21]:
read_test['sentiment'] = predictions
del read_test['context']
read_test

Unnamed: 0,review_id,sentiment
0,1577265,1
1,774185,1
2,3722095,1
3,359962,1
4,1735617,1
...,...,...
5495,1660361,1
5496,1404529,-1
5497,1343547,-1
5498,1708593,1


In [22]:
read_test.to_csv('task 1b.csv', index=None)