Goal:

Sentiment analysis of tweets

In [1]:
### Prepare dataframe
import pandas as pd

# Read both header-less CSV exports, training file first
csv1, csv2 = (
    pd.read_csv( path, header = None )
    for path in ( "twitter_training.csv", "twitter_validation.csv" )
)

# One pipeline: stack the two files, discard the first two columns, name the
# remaining ones, remove rows with missing values and renumber the index from 0
dataframe = (
    pd.concat( [csv1, csv2], ignore_index = True )
      .drop( columns = [0, 1] )
      .rename( columns = {2 : "Sentiment", 3 : "Tweet"} )
      .dropna()
      .reset_index( drop = True )
)

# Optional cap on the number of rows, left disabled (kept from the original as a
# workaround for a memory issue)
# dataframe = dataframe.drop( range(30000, len(dataframe)) )

In [2]:
### Text Preprocessing
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Keep the stopword collection under its own name: rebinding `stopwords` would
# replace the imported nltk corpus reader with a plain list. A set also makes
# every membership test below constant-time instead of a list scan.
english_stopwords = set( stopwords.words( "english" ) )

# Build the lemmatizer and the stemmer once instead of once per tweet
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

corpus = []             # cleaned, unique phrases, in dataframe row order
already_exists = set()  # every phrase stored in corpus so far
rows_to_drop = []       # index labels of rows whose phrase is empty or a duplicate

for i, tweet in dataframe["Tweet"].items():
  # 1 # Replace every non-letter with a space, then lowercase
  phrase = re.sub( "[^a-zA-Z]", " ", tweet ).lower()

  # 2 # Skip stopwords and lemmatize the remaining words
  words = [lemmatizer.lemmatize( word ) for word in phrase.split() if word not in english_stopwords]

  # 3 # Filter stopwords once more (on the lemmatized words) and stem what is left
  phrase = " ".join( stemmer.stem( word ) for word in words if word not in english_stopwords )

  # 4 # Keep the first occurrence of each non-empty phrase, queue every other row for removal
  if phrase == "" or phrase in already_exists:
    rows_to_drop.append( i )
  else:
    corpus.append( phrase )
    already_exists.add( phrase )

# A single drop after the loop avoids copying the whole dataframe once per rejected row
dataframe = dataframe.drop( rows_to_drop )

# The rows that remain must line up one-to-one with the corpus entries
assert len(dataframe) == len(corpus), "dataframe rows and corpus entries are out of sync"

print(f"Dataset's Length: {len(dataframe)}, Corpus' Length: {len(corpus)}")

Dataset's Length: 59838, Corpus' Length: 59838


In [3]:
# Create Bag of Words
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

# Learn the vocabulary from the corpus and count each term per phrase
term_counts = vectorizer.fit_transform( corpus )

# Convert the counts to a dense array (one row per phrase).
# NOTE(review): the dense array may be what causes the memory issue mentioned
# in the data-preparation cell - confirm before running on the full dataset.
features = term_counts.toarray()

# Target column; the row order matches corpus because both come from the same pass over dataframe
label = dataframe.loc[:, "Sentiment"]

In [4]:
# Split data
# Prepare evaluation imports (used by the model cells that follow)
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split

# 80 / 20 train / test split. The fixed random_state makes the split, and every
# metric computed from it, identical on each run.
X_train, X_test, y_train, y_test = train_test_split( features, label.values, test_size = 0.2, random_state = 42 )

print(len(X_train), len(y_train), len(X_test), len(y_test))

# Number of features per sample; as the last expression of the cell it is displayed
len(X_train[0])

47870 47870 11968 11968


In [None]:
### Build Logistic Regression
from sklearn.linear_model import LogisticRegression

# Train on the training split and predict the held-out split
LR = LogisticRegression( max_iter = 10000 ).fit( X_train, y_train )
LR_prediction = LR.predict( X_test )

# Hold-out metrics
LR_accuracy = accuracy_score( y_test, LR_prediction )
LR_matrix = confusion_matrix( y_test, LR_prediction )
print(f"Logistic Regression\n{LR_matrix}\n\nAccuracy: {LR_accuracy:.2}\n")

# 10-fold cross-validation over the test split.
# NOTE(review): this cross-validates on X_test / y_test rather than on the
# training split - confirm that is intended.
LR_score = cross_val_score( LR, X_test, y = y_test, cv = 10 )
print(f"\nScore: {LR_score}\n\nMean Score: {LR_score.mean():.2}")

In [None]:
### Build Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Fixed random_state so the fitted tree and all scores below are reproducible
TREE = DecisionTreeClassifier( random_state = 42 )
TREE.fit( X_train, y_train )
TREE_prediction = TREE.predict( X_test )

# Hold-out metrics
TREE_matrix = confusion_matrix( y_test, TREE_prediction )
TREE_accuracy = accuracy_score( y_test, TREE_prediction )

# 10-fold cross-validation over the test split.
# NOTE(review): this cross-validates on X_test / y_test rather than on the
# training split - confirm that is intended.
TREE_score = cross_val_score(estimator = TREE, y = y_test, X = X_test, cv = 10 )
print(f"Decision Tree\n{TREE_matrix}\n\nAccuracy: {TREE_accuracy:.2}\n\nScore: {TREE_score}\n\nMean Score: {TREE_score.mean():.2}")

In [None]:
### Build Random Forest
from sklearn.ensemble import RandomForestClassifier

# Fixed random_state so the fitted forest and its accuracy are reproducible
FOREST = RandomForestClassifier( n_estimators = 100, random_state = 42 )
FOREST.fit( X_train, y_train )
FOREST_prediction = FOREST.predict( X_test )

# Hold-out metrics
FOREST_matrix = confusion_matrix( y_test, FOREST_prediction )
FOREST_accuracy = accuracy_score( y_test, FOREST_prediction )
print(f"Random Forest\n{FOREST_matrix}\n\nAccuracy: {FOREST_accuracy:.2}\n")

# Cross-validation is disabled for this model (presumably because of its
# runtime - TODO confirm); the original lines are kept for reference.
# FOREST_score = cross_val_score(estimator = FOREST, y = y_test, X = X_test, cv = 10 )
# print(f"\nScore: {FOREST_score}\n\nMean Score: {FOREST_score.mean():.2}")

In [None]:
### Build Support Vector Machine
# Import the class under an alias: the name SVC is used below for the fitted
# model, and `SVC = SVC(...)` would leave the class itself unreachable.
from sklearn.svm import SVC as SupportVectorClassifier

SVC = SupportVectorClassifier( kernel = "rbf" )
SVC.fit( X_train, y_train )
SVC_prediction = SVC.predict( X_test )

# Hold-out metrics
SVC_matrix = confusion_matrix( y_test, SVC_prediction )
SVC_accuracy = accuracy_score( y_test, SVC_prediction )
print(f"SVM\n{SVC_matrix}\n\nAccuracy: {SVC_accuracy:.2}\n")

# 2-fold cross-validation over the test split.
# NOTE(review): this cross-validates on X_test / y_test rather than on the
# training split - confirm that is intended.
SVC_score = cross_val_score(estimator = SVC, y = y_test, X = X_test, cv = 2 )
print(f"Score: {SVC_score}\n\nMean Score: {SVC_score.mean():.2}")

In [None]:
### Build Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Train on the training split and predict the held-out split
NB = GaussianNB().fit( X_train, y_train )
NB_prediction = NB.predict( X_test )

# Hold-out metrics
NB_accuracy = accuracy_score( y_test, NB_prediction )
NB_matrix = confusion_matrix( y_test, NB_prediction )
print(f"Naive Bayes\n{NB_matrix}\n\nAccuracy: {NB_accuracy:.2}\n")

# 10-fold cross-validation over the test split.
# NOTE(review): this cross-validates on X_test / y_test rather than on the
# training split - confirm that is intended.
NB_score = cross_val_score( NB, X_test, y = y_test, cv = 10 )
print(f"\nScore: {NB_score}\n\nMean Score: {NB_score.mean():.2}")

In [None]:
### Build K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier

# Train a 5-neighbour classifier on the training split and predict the held-out split
KNN = KNeighborsClassifier( n_neighbors = 5 ).fit( X_train, y_train )
KNN_prediction = KNN.predict( X_test )

# Hold-out metrics
KNN_accuracy = accuracy_score( y_test, KNN_prediction )
KNN_matrix = confusion_matrix( y_test, KNN_prediction )
print(f"K-Nearest Neighbors\n{KNN_matrix}\n\nAccuracy: {KNN_accuracy:.2}\n")

# 10-fold cross-validation over the test split.
# NOTE(review): this cross-validates on X_test / y_test rather than on the
# training split - confirm that is intended.
KNN_score = cross_val_score( KNN, X_test, y = y_test, cv = 10 )
print(f"\nScore: {KNN_score}\n\nMean Score: {KNN_score.mean():.2}")