# Sentiment Analysis via the ML-based approach



### Part 1.a. Loading and Prep

Load, clean, and preprocess the data as you find necessary.

In [None]:
#nltk.download('wordnet')
#nltk.download('omw-1.4')

In [14]:
import pandas as pd
import re

#Below imports are commented as they are not final model
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from sklearn.naive_bayes import MultinomialNB


from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV


import warnings
warnings.filterwarnings('ignore')

import nltk 
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.corpus import stopwords


from joblib import dump, load # used for saving and loading sklearn objects
from scipy.sparse import save_npz, load_npz # used for saving and loading sparse matrices


In [None]:
# TODO: import other libraries as necessary

df_train = pd.read_csv("sentiment_train.csv")
df_test = pd.read_csv("sentiment_test.csv")

lst_stopwords = nltk.corpus.stopwords.words("english")
lst_stopwords = lst_stopwords.remove("not")
ps = nltk.stem.porter.PorterStemmer()
lem = nltk.stem.wordnet.WordNetLemmatizer()
        

# pre-processing data
def clean_data(data):
    processed_data = []
    for sentence in data:
        #exclude special characters
        processed_feature = re.sub(r'\W', ' ', sentence)
        #exclude single characters
        processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

        #exclude multiple spaces with single space
        processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)

        # Converting to Lowercase
        processed_feature = processed_feature.lower()
        
        lst_text = processed_feature.split()
        ## remove Stopwords
        if lst_stopwords is not None:
            lst_text = [word for word in lst_text if word not in lst_stopwords]
        
        lst_text = [ps.stem(word) for word in lst_text]
        lst_text = [lem.lemmatize(word) for word in lst_text]
        
        processed_data.append(" ".join(lst_text))
    return processed_data 

df_train['Sentence_clean'] = clean_data(df_train['Sentence'])
df_test['Sentence_clean'] = clean_data(df_test['Sentence'])

In [15]:
def vectorization_data(data1,data2):
        
    count_vect = CountVectorizer(ngram_range=(1, 3), token_pattern=r'\b\w+\b', min_df=1)
    X_train_counts = count_vect.fit_transform(data1.astype('U'))
    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    
    X_new_counts = count_vect.transform(data2.astype('U'))
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    
    return X_train_tfidf, X_new_tfidf
Processed_features_train, Processed_features_test = vectorization_data(df_train['Sentence_clean'] , df_test['Sentence_clean'] )


In [43]:
Words_In_Train = []
Words_In_Train_0_Polarity = []
Words_In_Train_1_Polarity = []
row_iter = 0
for sentence in df_train['Sentence_clean']:
    for word in sentence.split(" "):
        Words_In_Train.append(word)
        if df_train['Polarity'][row_iter] == 0:
            Words_In_Train_0_Polarity.append(word)
        if df_train['Polarity'][row_iter] == 1:
            Words_In_Train_1_Polarity.append(word)
    
        
    row_iter = row_iter+ 1
    

### Part 1.b. Modeling

Use your favorite ML algorithm to train a classification model.  Don’t forget everything that we’ve learned in our ML course: hyperparameter tuning, cross validation, handling imbalanced data, etc. Make reasonable decisions and try to create the best-performing classifier that you can.

In [22]:
#Assumption -- it is okay to report positive comment as negative. but it is not okay to report negative comment as positive
#so, False Positives as okay. but False Negatives are not acceptable. so, the modfel should be trained on Recall
score = "recall"
print("*********** SVM ******************")
svm_tuned_parameters = [
    {"kernel": ["rbf"], "gamma": [1e-3, 1e-4], "C": [1, 10, 100, 1000]},
    {"kernel": ["linear"], "C": [1, 10, 100, 1000]},
]



svm_grid = GridSearchCV(SVC(), svm_tuned_parameters, scoring= score)
svm_grid.fit(Processed_features_train, df_train['Polarity'])

print("SVM - Best parameters set found on Training Data:")
print()
print(svm_grid.best_params_)
print()
print("SVM - Results on Training Data:")
y_true, y_pred = df_train['Polarity'], svm_grid.predict(Processed_features_train)
print(classification_report(y_true, y_pred))
    
    
"""
#Naive Bias
print("************ Naive Bias ******************")
NB_params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0, ],}
multinomial_nb_grid = GridSearchCV(MultinomialNB(), param_grid=NB_params, n_jobs=-1, cv=5, verbose=5,scoring = score)
multinomial_nb_grid.fit(Processed_features_train, df_train['Polarity'])

y_true, y_pred = df_train['Polarity'], multinomial_nb_grid.predict(Processed_features_train)
print("Naive Bias - Best parameters set found on Training Data:")
print()
print(multinomial_nb_grid.best_params_)
print()
print("Naive Bias - Results on Train Data:")
print(classification_report(y_true, y_pred))


print("************ Random Forest ******************")
RF_param_grid = {'n_estimators': [ 100,200],'criterion':['gini', 'entropy']#'min_samples_split' : [2, 5, 10, 15, 100],#'min_samples_leaf' : [1, 2, 5, 10] 
                 ,'max_depth':[50,100,200,500,1000],'random_state':[0]}
rf = RandomForestClassifier()
# Instantiate the grid search model
RF_grid_search = GridSearchCV(estimator = rf, param_grid = RF_param_grid, cv = 3, n_jobs = 4, verbose = 2,scoring = score)
RF_grid_search.fit(Processed_features_train, df_train['Polarity'])
y_true, y_pred = df_train['Polarity'], RF_grid_search.predict(Processed_features_train)
print(" Random Forest  - Best parameters set found on Training Data:")
print()
print(RF_grid_search.best_params_)
print()

    
print(" Random Forest - Results on Train Data:")
print(classification_report(y_true, y_pred))""";


*********** SVM ******************
SVM - Best parameters set found on Training Data:

{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}

SVM - Results on Training Data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1213
           1       1.00      1.00      1.00      1187

    accuracy                           1.00      2400
   macro avg       1.00      1.00      1.00      2400
weighted avg       1.00      1.00      1.00      2400



### Part 1.c. Assessing

Use the testing data to measure the accuracy and F1-score of your model.  

In [21]:

print("SVM - Results on Test Data:")
y_true, y_pred = df_test['Polarity'], svm_grid.predict(Processed_features_test)
print(classification_report(y_true, y_pred))


"""print("Naive Bias - Results on Test Data:")
y_true, y_pred = df_test['Polarity'], multinomial_nb_grid.predict(Processed_features_test)
print(classification_report(y_true, y_pred))



print(" Random Forest - Results on Test Data:")
y_true, y_pred = df_test['Polarity'], RF_grid_search.predict(Processed_features_test)
print(classification_report(y_true, y_pred))""";

SVM - Results on Test Data:
              precision    recall  f1-score   support

           0       0.75      0.83      0.79       287
           1       0.83      0.74      0.78       313

    accuracy                           0.78       600
   macro avg       0.79      0.79      0.78       600
weighted avg       0.79      0.78      0.78       600



### Part 2. Given the accuracy and F1-score of your model, are you satisfied with the results, from a business point of view? Explain.

The models are optimized for **recall**. because, we dont want to miss any negative comment. so, the recall for the model should be good enough to not miss any negative sentiments.

I choose SVM model as by definition, it gives maximum speration between classes. also, the recall & f1 score is highest

so, it means, **with 83% probability this model is able to catch the** ***every*** **negative sentiment**. so, i am satisfied from business 

### Part 3. Show five example instances in which your model’s predictions were incorrect. Describe why you think the model was wrong. Don’t just guess: dig deep to figure out the root cause.

By digging, i figured out **not** is removed by stopwords. so i excluded that from stop words and boosted the performance of model

80% of the mismatched data have equal words from 0,1 polarity from the train data and hence the model did not pick up right tag

In [48]:
df_train['predict_score'] = svm_grid.predict(Processed_features_train)
unmatched_data = df_train[df_train['predict_score'] != df_train['Polarity']]
unmatched_data

Unnamed: 0,Sentence,Polarity,Sentence_clean,predict_score
71,#NAME?,1,name,0
219,#NAME?,1,name,0


***In Train data, the junk data resulted in incorrect predictions.***

In [49]:
df_test['predict_score'] = svm_grid.predict(Processed_features_test)
unmatched_data = df_test[df_test['predict_score'] != df_test['Polarity']]

In [50]:
def words_in_train(sentence,Words_In_Train):
    count = 0
    for word in sentence.split(" "):
        if word in Words_In_Train:
            count = count + 1
    return count

In [51]:
unmatched_data['words_in_train'] = [ words_in_train(sentence,Words_In_Train) for sentence in  unmatched_data['Sentence_clean'] ]
unmatched_data['words_match_in_0_Polarity'] = [ words_in_train(sentence,Words_In_Train_0_Polarity) for sentence in  unmatched_data['Sentence_clean'] ]
unmatched_data['words_match_in_1_Polarity'] = [ words_in_train(sentence,Words_In_Train_1_Polarity) for sentence in  unmatched_data['Sentence_clean'] ]
unmatched_data['differntiating_words_ratio'] = unmatched_data['words_match_in_0_Polarity'] /unmatched_data['words_match_in_1_Polarity']

In [54]:
import plotly.express as px
fig = px.histogram(unmatched_data['differntiating_words_ratio'] )
fig.show()

In [55]:
fig = px.ecdf(unmatched_data['differntiating_words_ratio'] )
fig.show()

# 80% of the mismatched data have equal words from 0,1 polarity from the train data and hence the model did not pick up right tag