In [None]:
import pandas as pd

import nltk
from sklearn.model_selection import train_test_split
from nltk.tokenize import sent_tokenize,word_tokenize
import numpy as np
from nltk.corpus import stopwords
import string
from nltk import WordNetLemmatizer
from nltk import PorterStemmer
import re

from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report,confusion_matrix




Dataset

In [None]:
# Load the raw sentiment dataset. The file is decoded as latin1 instead of
# pandas' default UTF-8.
Dataset = pd.read_csv("Q2 Sentiment Analysis Dataset.csv",encoding='latin1')
# Preview the first rows to check that the columns were parsed as expected.
Dataset.head()

Unnamed: 0,id,sentiment,date,text,Unnamed: 4,Unnamed: 5
0,623495523,1,Mon Dec 01 20:46:01 +0000 2014,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...,,
1,623495527,1,Mon Dec 01 21:09:50 +0000 2014,@apple Contact sync between Yosemite and iOS8 ...,,
2,623495529,1,Mon Dec 01 21:35:14 +0000 2014,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...,,
3,623495536,1,Mon Dec 01 23:55:55 +0000 2014,"@Apple, For the love of GAWD, CENTER the '1'on...",,
4,623495537,1,Tue Dec 02 00:06:05 +0000 2014,i get the storage almost full notification lit...,,


In [None]:
# Summarize column dtypes and non-null counts of the raw frame.
Dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3886 entries, 0 to 3885
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          3886 non-null   int64  
 1   sentiment   3886 non-null   object 
 2   date        3886 non-null   object 
 3   text        3886 non-null   object 
 4   Unnamed: 4  0 non-null      float64
 5   Unnamed: 5  0 non-null      float64
dtypes: float64(2), int64(1), object(3)
memory usage: 182.3+ KB


In [None]:
# Keep only the label and the text: the identifier, the timestamp and the two
# unnamed columns are not used by any later step of the notebook.
unused_columns = ['id', 'date', 'Unnamed: 4', 'Unnamed: 5']
df = Dataset.drop(columns=unused_columns)
df.head()

Unnamed: 0,sentiment,text
0,1,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...
1,1,@apple Contact sync between Yosemite and iOS8 ...
2,1,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...
3,1,"@Apple, For the love of GAWD, CENTER the '1'on..."
4,1,i get the storage almost full notification lit...


In [None]:
# Translate the string rating codes into readable class names. Values that are
# not keys of the mapping (e.g. 'not_relevant') pass through unchanged.
sentiment_names = {'1': 'negative', '3': 'neutral', '5': 'positive'}
df['sentiment'] = df['sentiment'].replace(sentiment_names)
df['sentiment'].value_counts().head()

neutral         2162
negative        1219
positive         423
not_relevant      82
Name: sentiment, dtype: int64

In [None]:
# Confirm that the sentiment column now holds the readable class names.
df.head()

Unnamed: 0,sentiment,text
0,negative,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...
1,negative,@apple Contact sync between Yosemite and iOS8 ...
2,negative,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...
3,negative,"@Apple, For the love of GAWD, CENTER the '1'on..."
4,negative,i get the storage almost full notification lit...


Check for missing values

In [None]:
# Count missing values per column; nothing is imputed or dropped here.
print(df.isna().sum())

sentiment    0
text         0
dtype: int64


In [None]:
def preprocess_text(text):
    """Clean a Series of raw strings and return the results as a list.

    Steps, in order: remove bracketed / parenthesised spans, replace newlines
    with spaces, lowercase, remove digit runs, remove '@' and '#', remove ASCII
    punctuation, then tokenize and drop English stopwords and
    single-character tokens.

    Parameters
    ----------
    text : pandas.Series
        Series of strings; it is processed through the ``.str`` accessor.

    Returns
    -------
    list of str
        One cleaned, space-joined string per input element, in input order.
    """
    # regex=True is passed explicitly on every pattern-based call. Relying on
    # the default raises a FutureWarning on older pandas and, from pandas 2.0
    # on (default regex=False), matches the patterns literally so that nothing
    # is removed at all.
    text = text.str.replace(r'[\(\[].*?[\)\]]', '', regex=True)  # Remove text in square brackets and parentheses
    text = text.str.replace('\n', ' ', regex=False)  # Remove newline characters
    text = text.str.lower()  # Convert text to lowercase
    text = text.str.replace(r'\d+', '', regex=True)  # Remove numbers
    text = text.str.replace(r'[@#]', '', regex=True)  # Remove '@' and '#' characters
    # Escape the punctuation before placing it in a character class so that
    # characters such as '\', ']', '^' and '-' are treated as literals.
    punctuation_pattern = '[{}]'.format(re.escape(string.punctuation))
    text = text.str.replace(punctuation_pattern, '', regex=True)  # Remove punctuation

    ignore = set(stopwords.words('english'))  # Stopwords to drop from the text

    clean_text = []
    for document in text:
        tokens = nltk.word_tokenize(document)
        # Keep tokens that are not stopwords and are longer than one character.
        kept = [token for token in tokens if token not in ignore and len(token) > 1]
        clean_text.append(" ".join(kept))

    return clean_text

def lemmatize_text(text):
    """Return ``text`` with every token replaced by its WordNet lemma.

    The string is split with ``word_tokenize``, each token is passed to
    ``WordNetLemmatizer.lemmatize`` with its default arguments, and the
    results are joined back together with single spaces.
    """
    wordnet = WordNetLemmatizer()
    return ' '.join(map(wordnet.lemmatize, word_tokenize(text)))


In [None]:
# nltk is already imported in the first cell, so this re-import is redundant
# but harmless.
import nltk
# Fetch the NLTK resources requested by name below. This cell has to run
# before preprocess_text / lemmatize_text are called.
nltk.download('punkt')
nltk.download('stopwords')  # provides stopwords.words('english')
nltk.download('wordnet')  # data for WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Assemble the cleaned corpus next to the raw text so that the effect of each
# preprocessing stage can be inspected row by row.
processed_df = pd.DataFrame({
    'old_text': df['text'],
    'text': preprocess_text(df['text']),
})
processed_df['text_lemmatized'] = processed_df['text'].map(lemmatize_text)
# String copy of the lemmatized column, consumed by the analysis frame below.
processed_df['text_separated'] = processed_df['text_lemmatized'].map(str)
processed_df['sentiment'] = df['sentiment']

processed_df.head()

  text = text.str.replace(r'[\(\[].*?[\)\]]', '')# Remove text in square brackets and parentheses
  text = text.str.replace(r'\d+', '')# Remove numbers
  text = text.str.replace(r'[@#]', '')# Remove '@' and '#' characters
  text = text.str.replace('[{}]'.format(string.punctuation), '') # Remove punctuation


Unnamed: 0,old_text,text,text_lemmatized,text_separated,sentiment
0,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...,wtf battery one second ago wtf apple,wtf battery one second ago wtf apple,wtf battery one second ago wtf apple,negative
1,@apple Contact sync between Yosemite and iOS8 ...,apple contact sync yosemite ios seriously scre...,apple contact sync yosemite io seriously screw...,apple contact sync yosemite io seriously screw...,negative
2,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...,warning buy iphone unlocked apple iphone use v...,warning buy iphone unlocked apple iphone use v...,warning buy iphone unlocked apple iphone use v...,negative
3,"@Apple, For the love of GAWD, CENTER the '1'on...",apple love gawd center damn calendar app fixed...,apple love gawd center damn calendar app fixed...,apple love gawd center damn calendar app fixed...,negative
4,i get the storage almost full notification lit...,get storage almost full notification literally...,get storage almost full notification literally...,get storage almost full notification literally...,negative


In [None]:
# Check that every derived column is fully populated.
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3886 entries, 0 to 3885
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   old_text         3886 non-null   object
 1   text             3886 non-null   object
 2   text_lemmatized  3886 non-null   object
 3   text_separated   3886 non-null   object
 4   sentiment        3886 non-null   object
dtypes: object(5)
memory usage: 151.9+ KB


### Analysis Dataset

In [None]:
# Two-column working frame: the cleaned text under 'word' and its class name
# under 'Sentiment'.
analysis_df = pd.DataFrame({
    'word': processed_df['text_separated'].map(str),
    'Sentiment': processed_df['sentiment'],
})
analysis_df.head()


Unnamed: 0,word,Sentiment
0,wtf battery one second ago wtf apple,negative
1,apple contact sync yosemite io seriously screw...,negative
2,warning buy iphone unlocked apple iphone use v...,negative
3,apple love gawd center damn calendar app fixed...,negative
4,get storage almost full notification literally...,negative


In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encode the class names as integers for scikit-learn. LabelEncoder numbers
# the classes in sorted order of their names.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(analysis_df['Sentiment'])
word_series = pd.DataFrame({
    'Word': analysis_df['word'],
    'sentiment': encoded_labels,
})
sentiments = word_series['sentiment'].unique()

word_series.head()

Unnamed: 0,Word,sentiment
0,wtf battery one second ago wtf apple,0
1,apple contact sync yosemite io seriously screw...,0
2,warning buy iphone unlocked apple iphone use v...,0
3,apple love gawd center damn calendar app fixed...,0
4,get storage almost full notification literally...,0


In [None]:
# Class distribution after encoding; the counts should match the named classes above.
word_series.sentiment.value_counts()

1    2162
0    1219
3     423
2      82
Name: sentiment, dtype: int64

Splitting the data

In [None]:
# split data
# Hold out 20% of the rows for testing. The split is seeded so the reported
# metrics are reproducible across kernel restarts, and stratified on the label
# so that every class, including the rare ones, is represented in both
# partitions in proportion to its frequency.
RANDOM_STATE = 42
X = analysis_df.word
y = word_series.sentiment
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (3108,) (3108,)
Testing set shape: (778,) (778,)


> Techniques:
<ol>
<li>Bag of words based on raw counts </li>
<li>Bag of words based on TF-IDF </li>
<li>ngrams (unigrams, bigrams, trigrams)</li> </ol>

1. Bag of words based on raw counts

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Raw-count bag of words with CountVectorizer's default settings.
count_vectorizer = CountVectorizer()
# The vocabulary is fitted on the training split only; the test split is
# transformed with that same vocabulary.
X_train_count_vectorizer = count_vectorizer.fit_transform(X_train)
X_test_count_vectorizer = count_vectorizer.transform(X_test)

2. Bag of words based on TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighted bag of words with TfidfVectorizer's default settings.
tfidf_vectorizer = TfidfVectorizer()
# Fitted on the training split only; the test split reuses the fitted state.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

3. Ngrams

In [None]:
# Count features over unigrams, bigrams and trigrams together.
ngram_range = (1, 3)
# NOTE(review): this rebinds `count_vectorizer`, replacing the vectorizer that
# was fitted in the raw-count cell. The matrices computed there are unaffected,
# but the unigram-only vectorizer is no longer reachable under this name.
count_vectorizer = CountVectorizer( ngram_range=ngram_range)

# Fitted on the training split only; the test split reuses the fitted vocabulary.
X_train_ngram =count_vectorizer.fit_transform(X_train)
X_test_ngram = count_vectorizer.transform(X_test)


> Classifiers:

In [None]:
def get_logisticRegression(X_train, X_test, y_train, y_test):
    """Fit a logistic-regression classifier and report its test performance.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the training and test splits.
    y_train, y_test
        Labels aligned with ``X_train`` and ``X_test`` respectively.

    Returns
    -------
    str
        The text report built by ``classification_report``.
    """
    lr = LogisticRegression(max_iter=1000)
    lr.fit(X_train, y_train)
    predictions = lr.predict(X_test)
    # classification_report takes (y_true, y_pred). The ground truth must come
    # first, otherwise precision and recall are swapped and the support column
    # counts predictions instead of true instances.
    return classification_report(y_test, predictions)


def get_naiveBayes(X_train, X_test, y_train, y_test):
    """Fit a multinomial naive Bayes classifier and report its test performance.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the training and test splits.
    y_train, y_test
        Labels aligned with ``X_train`` and ``X_test`` respectively.

    Returns
    -------
    str
        The text report built by ``classification_report``.
    """
    nb = MultinomialNB()
    nb.fit(X_train, y_train)
    predictions = nb.predict(X_test)
    # classification_report takes (y_true, y_pred). The ground truth must come
    # first, otherwise precision and recall are swapped and the support column
    # counts predictions instead of true instances.
    return classification_report(y_test, predictions)

def get_randomForest(X_train, X_test, y_train, y_test, random_state=42):
    """Fit a random-forest classifier and report its test performance.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the training and test splits.
    y_train, y_test
        Labels aligned with ``X_train`` and ``X_test`` respectively.
    random_state : int or None, optional
        Seed forwarded to ``RandomForestClassifier`` so that repeated runs
        give the same forest. Pass ``None`` for an unseeded forest.

    Returns
    -------
    str
        The text report built by ``classification_report``.
    """
    rf = RandomForestClassifier(random_state=random_state)
    rf.fit(X_train, y_train)
    predictions = rf.predict(X_test)
    # classification_report takes (y_true, y_pred). The ground truth must come
    # first, otherwise precision and recall are swapped and the support column
    # counts predictions instead of true instances.
    return classification_report(y_test, predictions)

def get_svm(X_train, X_test, y_train, y_test):
    """Fit a support-vector classifier (``SVC`` defaults) and report its test performance.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the training and test splits.
    y_train, y_test
        Labels aligned with ``X_train`` and ``X_test`` respectively.

    Returns
    -------
    str
        The text report built by ``classification_report``.
    """
    svm = SVC()
    svm.fit(X_train, y_train)
    predictions = svm.predict(X_test)
    # classification_report takes (y_true, y_pred). The ground truth must come
    # first, otherwise precision and recall are swapped and the support column
    # counts predictions instead of true instances.
    return classification_report(y_test, predictions)


def get_perceptron(X_train, X_test, y_train, y_test):
    """Fit a perceptron classifier and report its test performance.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the training and test splits.
    y_train, y_test
        Labels aligned with ``X_train`` and ``X_test`` respectively.

    Returns
    -------
    str
        The text report built by ``classification_report``.
    """
    perceptron = Perceptron()
    perceptron.fit(X_train, y_train)
    predictions = perceptron.predict(X_test)
    # classification_report takes (y_true, y_pred). The ground truth must come
    # first, otherwise precision and recall are swapped and the support column
    # counts predictions instead of true instances.
    return classification_report(y_test, predictions)

def classify(method, X_train, X_test, y_train, y_test):
    """Train each classifier on one feature representation and print its report.

    ``method`` is a label for the feature representation; it is only used in
    the printed headings. The remaining arguments are passed unchanged to each
    ``get_*`` helper, in the order: logistic regression, random forest,
    naive Bayes, SVM, perceptron.
    """
    runners = (
        ("Logistic Regression for", get_logisticRegression),
        ("Random Forest for", get_randomForest),
        ("Naive Bayes for", get_naiveBayes),
        ("SVM for", get_svm),
        ("Perceptron for", get_perceptron),
    )
    for heading, run in runners:
        print(heading, method, "\n", run(X_train, X_test, y_train, y_test))


In [None]:

# Evaluate all five classifiers on the raw-count bag-of-words features.
classify("Bag of words",X_train_count_vectorizer,X_test_count_vectorizer,y_train,y_test)


Logistic Regression for  Bag of words 
               precision    recall  f1-score   support

           0       0.65      0.76      0.70       223
           1       0.89      0.70      0.78       512
           2       0.05      0.33      0.08         3
           3       0.31      0.72      0.44        40

    accuracy                           0.72       778
   macro avg       0.47      0.63      0.50       778
weighted avg       0.79      0.72      0.74       778



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest for  Bag of words 
               precision    recall  f1-score   support

           0       0.61      0.79      0.69       203
           1       0.90      0.68      0.77       538
           2       0.00      0.00      0.00         0
           3       0.27      0.68      0.38        37

    accuracy                           0.71       778
   macro avg       0.45      0.54      0.46       778
weighted avg       0.80      0.71      0.73       778

Naive Bayes for Bag of words 
               precision    recall  f1-score   support

           0       0.75      0.67      0.71       291
           1       0.83      0.72      0.77       460
           2       0.05      1.00      0.09         1
           3       0.19      0.69      0.30        26

    accuracy                           0.70       778
   macro avg       0.45      0.77      0.47       778
weighted avg       0.78      0.70      0.73       778

SVM for  Bag of words 
               precision    recall  f1-sco

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Evaluate all five classifiers on the TF-IDF weighted features.
classify("Bag of words with TFIDF",X_train_tfidf,X_test_tfidf,y_train,y_test)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression for  Bag of words with TFIDF 
               precision    recall  f1-score   support

           0       0.64      0.75      0.69       223
           1       0.90      0.68      0.77       533
           2       0.00      0.00      0.00         0
           3       0.20      0.86      0.33        22

    accuracy                           0.70       778
   macro avg       0.44      0.57      0.45       778
weighted avg       0.80      0.70      0.74       778



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest for  Bag of words with TFIDF 
               precision    recall  f1-score   support

           0       0.59      0.76      0.67       205
           1       0.89      0.66      0.76       542
           2       0.00      0.00      0.00         0
           3       0.22      0.65      0.32        31

    accuracy                           0.69       778
   macro avg       0.42      0.52      0.44       778
weighted avg       0.78      0.69      0.72       778

Naive Bayes for Bag of words with TFIDF 
               precision    recall  f1-score   support

           0       0.62      0.78      0.69       210
           1       0.93      0.67      0.78       559
           2       0.00      0.00      0.00         0
           3       0.08      0.78      0.14         9

    accuracy                           0.70       778
   macro avg       0.41      0.56      0.40       778
weighted avg       0.84      0.70      0.75       778

SVM for  Bag of words with TFIDF 
         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Evaluate all five classifiers on the 1- to 3-gram count features.
classify("Ngrams",X_train_ngram,X_test_ngram,y_train,y_test)

Logistic Regression for  Ngrams 
               precision    recall  f1-score   support

           0       0.64      0.77      0.70       217
           1       0.90      0.69      0.78       524
           2       0.00      0.00      0.00         1
           3       0.27      0.69      0.39        36

    accuracy                           0.71       778
   macro avg       0.45      0.54      0.47       778
weighted avg       0.79      0.71      0.74       778



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest for  Ngrams 
               precision    recall  f1-score   support

           0       0.54      0.84      0.66       169
           1       0.93      0.65      0.77       573
           2       0.00      0.00      0.00         0
           3       0.26      0.67      0.37        36

    accuracy                           0.69       778
   macro avg       0.43      0.54      0.45       778
weighted avg       0.81      0.69      0.73       778

Naive Bayes for Ngrams 
               precision    recall  f1-score   support

           0       0.72      0.70      0.71       267
           1       0.84      0.71      0.77       478
           2       0.05      0.50      0.09         2
           3       0.23      0.68      0.34        31

    accuracy                           0.70       778
   macro avg       0.46      0.65      0.48       778
weighted avg       0.77      0.70      0.73       778

SVM for  Ngrams 
               precision    recall  f1-score   support

    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
