Goal:

Sentiment analysis of tweets

In [1]:
### Prepare dataframe
import pandas as pd

dataframe = pd.read_csv("Twitter_Sentiment.csv")

# Drop useless columns and empty rows
dataframe = dataframe.drop(['0', '1'], axis = 1)
dataframe = dataframe.rename(columns = {'2' : "Sentiment", '3' : "Tweet"})
dataframe = dataframe.dropna( )

# Reset indicies
dataframe.reset_index(drop = True, inplace = True)

In [2]:
### Text Preprocessing
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

stopwords = stopwords.words( "english" )
corpus = []

duplicates = []
already_exists = set()

for i in dataframe.index:
  phrase = dataframe.loc[i, "Tweet"]

  # 1 # Remove non letters
  phrase = re.sub( "[^a-zA-Z]", " ", phrase ).lower()

  # print(f"Original phrase: {phrase}")

  # 2 # Lemmatization
  lemmatizer = WordNetLemmatizer()
  phrase = [lemmatizer.lemmatize(word) for word in phrase.split() if word not in stopwords]
  phrase = " ".join(phrase)

  # print(f"Lemmaitzed phrase: {phrase}")

  # 2 # Remove stopwords and Remove suffixes (Stemming)
  stemmer = PorterStemmer()
  phrase = [stemmer.stem( word ) for word in phrase.split() if word not in stopwords]
  phrase = " ".join(phrase)

  # print(f"Stemmed phrase: {phrase}")

  # 3 # Check if the phrase already exists
  if (phrase in already_exists) or (phrase == "") or (phrase == " "):
  # 3a # Remove duplicates from the dataframe
    dataframe = dataframe.drop( i )

  elif not (phrase in already_exists) :
    # 3b # Store unique phrase in corpus
    corpus.append( phrase )
    # 3c # Store phrase to keep track of duplicates
    already_exists.add( phrase )
    
  # print("-----")

print(f"Dataset's Length: {len(dataframe)}, Corpus' Length: {len(corpus)}")

Dataset's Length: 59838, Corpus' Length: 59838


In [3]:
# Create Bag of Words and label
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
features = vectorizer.fit_transform( corpus ).toarray()
label = dataframe["Sentiment"]
label = label.reset_index( drop = True ) # To later remove the unfrequent words

print(f"Number of phrases: {len(features)}\nNumber of words in the corpus: {len(features[0])}")

Number of phrases: 59838
Number of words in the corpus: 22774


In [4]:
# Checking the count of words in the bag of words
words = vectorizer.get_feature_names_out() # get the words
BoW_df = pd.DataFrame(features, columns = words) # create dataframe of words and their count

# Create a dictionary of {Word : Number of times it was used}
number_of_times_word_is_used = {}
for word in BoW_df.columns:
  total = sum(BoW_df[word])
  number_of_times_word_is_used[word] = total

In [5]:
### Use the dictionary to clean the corpus by removing the words that are not frequently found (counted less than 40 times)

# Store all the words that were counted less than 40 times
not_frequent_words = set()
for word, count in number_of_times_word_is_used.items():
  if count < 40:
    not_frequent_words.add( word )

print(f"Number of words counted less than 40 times: {len(not_frequent_words)}\n")

Number of words counted less than 40 times: 20436



In [6]:
### Remove the unfrequent words from the corpus

indicies_to_remove = [] # To later remove them from the label and the corpus

for i in range(len(corpus)):
  phrase = corpus[i]
  new_phrase = []

  # Keep the words that are not in the not_frequent list  
  for word in phrase.split():
    if not (word in not_frequent_words):
      new_phrase.append(word)
  
  new_phrase = " ".join(new_phrase)
  # If the phrase is empty (all the words in the phrase are not frequent), save the phrase's index to remove it and remove its label 
  if new_phrase == "":
    indicies_to_remove.append(i)
  
  # Replace the current phrase in the corpus with the new_phrase
  corpus[i] = new_phrase


# Remove the empty phrases and their label
for index in indicies_to_remove:
  label = label.drop( index )
  corpus.remove( "" )

In [7]:
# Create new Bag of Words with new features

vectorizer = CountVectorizer()
features = vectorizer.fit_transform( corpus ).toarray()

print(f"Number of phrases: {len(features)}\nNumber of words in the corpus: {len(features[0])}\nNumber of labels: {len(label)}")

Number of phrases: 59517
Number of words in the corpus: 2338
Number of labels: 59517


In [8]:
# Split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( features, label.values, test_size = 0.2 )

print(f"X_train: {len(X_train)}, y_train: {len(y_train)}, X_test: {len(X_test)}, y_test: {len(y_test)}")
print(f"Length of element in X_train: {len(X_train[0])}")

# Prepare evaluation imports
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score

X_train: 47613, y_train: 47613, X_test: 11904, y_test: 11904
Length of element in X_train: 2338


In [9]:
### Build Logistic Regression
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression( max_iter = 10000 )
LR.fit( X_train, y_train )
LR_prediction = LR.predict( X_test )

LR_matrix = confusion_matrix( y_test, LR_prediction )
LR_accuracy = accuracy_score( y_test, LR_prediction )
print(f"Logistic Regression\n{LR_matrix}\n\nAccuracy: {LR_accuracy:.2}\n")

LR_score = cross_val_score(estimator = LR, y = y_test, X = X_test, cv = 10 )
print(f"\nScore: {LR_score}\n\nMean Score: {LR_score.mean():.2}")

Logistic Regression
[[1071  360  260  421]
 [ 200 2694  300  335]
 [ 298  453 1812  513]
 [ 252  382  363 2190]]

Accuracy: 0.65


Score: [0.6070529  0.59529807 0.58942065 0.61125105 0.60168067 0.62521008
 0.62436975 0.61512605 0.61596639 0.62436975]

Mean Score: 0.61


In [10]:
### Build Decision Tree
from sklearn.tree import DecisionTreeClassifier
TREE = DecisionTreeClassifier(  )
TREE.fit( X_train, y_train )
TREE_prediction = TREE.predict( X_test )

TREE_matrix = confusion_matrix( y_test, TREE_prediction )
TREE_accuracy = accuracy_score( y_test, TREE_prediction )
TREE_score = cross_val_score(estimator = TREE, y = y_test, X = X_test, cv = 10 )
print(f"Decision Tree\n{TREE_matrix}\n\nAccuracy: {TREE_accuracy:.2}\n\nScore: {TREE_score}\n\nMean Score: {TREE_score.mean():.2}")

Decision Tree
[[1585  178  126  223]
 [ 183 2921  191  234]
 [ 174  233 2456  213]
 [ 271  260  195 2461]]

Accuracy: 0.79

Score: [0.59865659 0.59109992 0.60033585 0.58354324 0.58823529 0.59411765
 0.62184874 0.61848739 0.61764706 0.6092437 ]

Mean Score: 0.6


In [11]:
### Build Random Forest
from sklearn.ensemble import RandomForestClassifier
FOREST = RandomForestClassifier( n_estimators = 100)
FOREST.fit( X_train, y_train )
FOREST_prediction = FOREST.predict( X_test )

FOREST_matrix = confusion_matrix( y_test, FOREST_prediction )
FOREST_accuracy = accuracy_score( y_test, FOREST_prediction )
print(f"Random Forest\n{FOREST_matrix}\n\nAccuracy: {FOREST_accuracy:.2}\n")

FOREST_score = cross_val_score(estimator = FOREST, y = y_test, X = X_test, cv = 10 )
print(f"\nScore: {FOREST_score}\n\nMean Score: {FOREST_score.mean():.2}")

Random Forest
[[1749  118   76  169]
 [  48 3282   73  126]
 [  65  128 2739  144]
 [  88  128   64 2907]]

Accuracy: 0.9


Score: [0.68261965 0.68513854 0.6759026  0.69269521 0.67478992 0.71176471
 0.67983193 0.68067227 0.69495798 0.69327731]

Mean Score: 0.69


In [12]:
### Build Naive Bayes
from sklearn.naive_bayes import GaussianNB
NB = GaussianNB()
NB.fit( X_train, y_train )
NB_prediction = NB.predict( X_test )

NB_matrix = confusion_matrix( y_test, NB_prediction )
NB_accuracy = accuracy_score( y_test, NB_prediction )
print(f"Naive Bayes\n{NB_matrix}\n\nAccuracy: {NB_accuracy:.2}\n")

NB_score = cross_val_score(estimator = NB, y = y_test, X = X_test, cv = 10 )
print(f"\nScore: {NB_score}\n\nMean Score: {NB_score.mean():.2}")

Naive Bayes
[[1896   59   30  127]
 [1656 1265  174  434]
 [1547  259  754  516]
 [1815  150  117 1105]]

Accuracy: 0.42


Score: [0.37531486 0.38371117 0.38371117 0.37783375 0.39495798 0.38151261
 0.39747899 0.38319328 0.38739496 0.40084034]

Mean Score: 0.39


In [13]:
### Build K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier( n_neighbors = 5 )
KNN.fit( X_train, y_train )
KNN_prediction = KNN.predict( X_test )

KNN_matrix = confusion_matrix( y_test, KNN_prediction )
KNN_accuracy = accuracy_score( y_test, KNN_prediction )
print(f"K-Nearest Neighbors\n{KNN_matrix}\n\nAccuracy: {KNN_accuracy:.2}\n")

KNN_score = cross_val_score(estimator = KNN, y = y_test, X = X_test, cv = 10 )
print(f"\nScore: {KNN_score}\n\nMean Score: {KNN_score.mean():.2}")

K-Nearest Neighbors
[[1723  157   70  162]
 [ 157 3180   56  136]
 [ 148  278 2505  145]
 [ 177  263  142 2605]]

Accuracy: 0.84


Score: [0.48530647 0.48782536 0.50713686 0.52980688 0.51596639 0.54285714
 0.53697479 0.50588235 0.51428571 0.53109244]

Mean Score: 0.52


In [14]:
### Build Support Vector Machine
from sklearn.svm import SVC
SVC = SVC( kernel = "rbf" )
SVC.fit( X_train, y_train )
SVC_prediction = SVC.predict( X_test )

SVC_matrix = confusion_matrix( y_test, SVC_prediction )
SVC_accuracy = accuracy_score( y_test, SVC_prediction )
print(f"SVM\n{SVC_matrix}\n\nAccuracy: {SVC_accuracy:.2}\n")

SVC_score = cross_val_score(estimator = SVC, y = y_test, X = X_test, cv = 10 )
print(f"Score: {SVC_score}\n\nMean Score: {SVC_score.mean():.2}")

SVM
[[1466  246   98  302]
 [  45 3256   72  156]
 [  88  258 2433  297]
 [  84  243  108 2752]]

Accuracy: 0.83

Score: [0.65910999 0.65910999 0.63643997 0.66246851 0.6394958  0.69159664
 0.67142857 0.67226891 0.65546218 0.66806723]

Mean Score: 0.66
