In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import precision_score, recall_score, f1_score

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kaspe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kaspe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kaspe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kaspe\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# Import dataset
path_training = "./twitter_training.csv"
path_validation = "./twitter_validation.csv"
colnames = ["Tweet_ID","Entity","Sentiment","Tweet_Text"]

training   = pd.read_csv(path_training, names = colnames, header= None)
validation = pd.read_csv(path_validation, names = colnames, header= None)

# pre-process data
training   = training.dropna()
training   = training[training["Sentiment"] != "Irrelevant"]
training   = training[training["Sentiment"] != "Neutral"]
validation = validation.dropna()
validation   = validation[validation["Sentiment"] != "Irrelevant"]
validation   = validation[validation["Sentiment"] != "Neutral"]

sentiment = {'Positive':1,'Negative':0}

training.Sentiment = [sentiment[item] for item in training.Sentiment]
validation.Sentiment = [sentiment[item] for item in validation.Sentiment]

In [3]:
# Creating dataframe for the results
results = pd.DataFrame(columns=["Type","Precision","Recall","F1_score"])

In [4]:
# Checking for bias in dataset
training["Sentiment"].value_counts()

0    22358
1    20655
Name: Sentiment, dtype: int64

In [5]:
# Baseline

# vectorize data
vectorizer = TfidfVectorizer(lowercase= False)
X_train    = vectorizer.fit_transform(training['Tweet_Text'])
X_validate = vectorizer.fit_transform(validation['Tweet_Text'])

# Select target value
y_train    = training["Sentiment"]
y_validate = validation["Sentiment"]

# Select 10 best features based on chi squared test.
chi_square = SelectKBest(chi2)

X_train    = chi_square.fit_transform(X_train, y_train)
X_validate = chi_square.fit_transform(X_validate, y_validate)

# Apply RandomForestClassifier on features
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)
Predictions_base = clf.predict(X_validate)

# Add Results to DataFrame with all outcomes 
results = results.append({"Type"      : "Baseline",
                "Precision" : precision_score(y_validate, Predictions_base),
                "Recall"    : recall_score(y_validate, Predictions_base),
                "F1_score"  : f1_score(y_validate, Predictions_base)},
              ignore_index=True)

In [6]:
# LowerCase

# vectorize data
vectorizer = TfidfVectorizer(lowercase= True)
X_train    = vectorizer.fit_transform(training['Tweet_Text'])
X_validate = vectorizer.fit_transform(validation['Tweet_Text'])

# Select target value
y_train    = training["Sentiment"]
y_validate = validation["Sentiment"]

# Select 10 best features based on chi squared test.
chi_square = SelectKBest(chi2)

X_train    = chi_square.fit_transform(X_train, y_train)
X_validate = chi_square.fit_transform(X_validate, y_validate)

# Apply RandomForestClassifier on features
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)
Predictions_Lower_Case = clf.predict(X_validate)

# Add Results to DataFrame with all outcomes 
results = results.append({"Type"      : "LowerCase",
                "Precision" : precision_score(y_validate, Predictions_Lower_Case),
                "Recall"    : recall_score(y_validate, Predictions_Lower_Case),
                "F1_score"  : f1_score(y_validate, Predictions_Lower_Case)},
              ignore_index=True)

In [7]:
# Stopwords

# List of English stop words to be removed
lst_stopwords = nltk.corpus.stopwords.words('english')

# vectorize data
vectorizer = TfidfVectorizer(lowercase= False, stop_words= lst_stopwords)
X_train    = vectorizer.fit_transform(training['Tweet_Text'])
X_validate = vectorizer.fit_transform(validation['Tweet_Text'])

# Select target value
y_train    = training["Sentiment"]
y_validate = validation["Sentiment"]

# Select 10 best features based on chi squared test.
chi_square = SelectKBest(chi2)

X_train    = chi_square.fit_transform(X_train, y_train)
X_validate = chi_square.fit_transform(X_validate, y_validate)

# Apply RandomForestClassifier on features
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)
Predictions_Stop_Words = clf.predict(X_validate)

# Add Results to DataFrame with all outcomes 
results = results.append({"Type"      : "StopWords",
                "Precision" : precision_score(y_validate, Predictions_Stop_Words),
                "Recall"    : recall_score(y_validate, Predictions_Stop_Words),
                "F1_score"  : f1_score(y_validate, Predictions_Stop_Words)},
              ignore_index=True)

In [8]:
## Stemming 
ps = nltk.stem.porter.PorterStemmer()

def stemming(text):
    lst_text = text.split()

    lst_text = [ps.stem(word) for word in lst_text]
    text = " ".join(lst_text)
    return text

training["Tweet_Text_stem"] = training["Tweet_Text"].apply(lambda x: stemming(x))
validation["Tweet_Text_stem"] = validation["Tweet_Text"].apply(lambda x: stemming(x))

# vectorize data
vectorizer = TfidfVectorizer(lowercase= False)
X_train    = vectorizer.fit_transform(training['Tweet_Text_stem'])
X_validate = vectorizer.fit_transform(validation['Tweet_Text_stem'])

# Select target value
y_train    = training["Sentiment"]
y_validate = validation["Sentiment"]

# Select 10 best features based on chi squared test.
chi_square = SelectKBest(chi2)

X_train    = chi_square.fit_transform(X_train, y_train)
X_validate = chi_square.fit_transform(X_validate, y_validate)

# Apply RandomForestClassifier on features
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)
Predictions_lemma = clf.predict(X_validate)

# Add Results to DataFrame with all outcomes 
results = results.append({"Type"      : "Stemming",
                "Precision" : precision_score(y_validate, Predictions_lemma),
                "Recall"    : recall_score(y_validate, Predictions_lemma),
                "F1_score"  : f1_score(y_validate, Predictions_lemma)},
              ignore_index=True)

In [9]:
## Lowercase + stopwords

training["Tweet_Text_stem"] = training["Tweet_Text"].apply(lambda x: stemming(x))
validation["Tweet_Text_stem"] = validation["Tweet_Text"].apply(lambda x: stemming(x))

# vectorize data
vectorizer = TfidfVectorizer(stop_words= lst_stopwords)
X_train    = vectorizer.fit_transform(training['Tweet_Text_stem'])
X_validate = vectorizer.fit_transform(validation['Tweet_Text_stem'])

# Select target value
y_train    = training["Sentiment"]
y_validate = validation["Sentiment"]

# Select 10 best features based on chi squared test.
chi_square = SelectKBest(chi2)

X_train    = chi_square.fit_transform(X_train, y_train)
X_validate = chi_square.fit_transform(X_validate, y_validate)

# Apply RandomForestClassifier on features
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)
Predictions_lemma = clf.predict(X_validate)

# Add Results to DataFrame with all outcomes 
results = results.append({"Type"      : "LowerCase X Stopwords",
                "Precision" : precision_score(y_validate, Predictions_lemma),
                "Recall"    : recall_score(y_validate, Predictions_lemma),
                "F1_score"  : f1_score(y_validate, Predictions_lemma)},
              ignore_index=True)

In [10]:
## LowerCase + Stemming

def stemming(text):
    lst_text = text.split()
    lst_text = [ps.stem(word) for word in lst_text]
    text = " ".join(lst_text)
    return text

training["Tweet_Text_stem"] = training["Tweet_Text"].apply(lambda x: stemming(x))
validation["Tweet_Text_stem"] = validation["Tweet_Text"].apply(lambda x: stemming(x))

# vectorize data
vectorizer = TfidfVectorizer()
X_train    = vectorizer.fit_transform(training['Tweet_Text_stem'])
X_validate = vectorizer.fit_transform(validation['Tweet_Text_stem'])

# Select target value
y_train    = training["Sentiment"]
y_validate = validation["Sentiment"]

# Select 10 best features based on chi squared test.
chi_square = SelectKBest(chi2)

X_train    = chi_square.fit_transform(X_train, y_train)
X_validate = chi_square.fit_transform(X_validate, y_validate)

# Apply RandomForestClassifier on features
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)
Predictions_lemma = clf.predict(X_validate)

# Add Results to DataFrame with all outcomes 
results = results.append({"Type"      : "LowerCase X Stemming",
                "Precision" : precision_score(y_validate, Predictions_lemma),
                "Recall"    : recall_score(y_validate, Predictions_lemma),
                "F1_score"  : f1_score(y_validate, Predictions_lemma)},
              ignore_index=True)

In [11]:
## Stopword + Stemming

def stemming(text):
    lst_text = text.split()    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in lst_stopwords]
        
    lst_text = [ps.stem(word) for word in lst_text]
    text = " ".join(lst_text)
    return text

training["Tweet_Text_stem"] = training["Tweet_Text"].apply(lambda x: stemming(x))
validation["Tweet_Text_stem"] = validation["Tweet_Text"].apply(lambda x: stemming(x))

# vectorize data
vectorizer = TfidfVectorizer(lowercase= False)
X_train    = vectorizer.fit_transform(training['Tweet_Text_stem'])
X_validate = vectorizer.fit_transform(validation['Tweet_Text_stem'])

# Select target value
y_train    = training["Sentiment"]
y_validate = validation["Sentiment"]

# Select 10 best features based on chi squared test.
chi_square = SelectKBest(chi2)

X_train    = chi_square.fit_transform(X_train, y_train)
X_validate = chi_square.fit_transform(X_validate, y_validate)

# Apply RandomForestClassifier on features
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)
Predictions_lemma = clf.predict(X_validate)

# Add Results to DataFrame with all outcomes 
results = results.append({"Type"      : "Stopwords X Stemming",
                "Precision" : precision_score(y_validate, Predictions_lemma),
                "Recall"    : recall_score(y_validate, Predictions_lemma),
                "F1_score"  : f1_score(y_validate, Predictions_lemma)},
              ignore_index=True)

In [12]:
## Stopwords + Stemming + Lowercase

def stemming(text):
    lst_text = text.split()    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in lst_stopwords]
        
    lst_text = [ps.stem(word) for word in lst_text]
    text = " ".join(lst_text)
    return text

training["Tweet_Text_stem"] = training["Tweet_Text"].apply(lambda x: stemming(x))
validation["Tweet_Text_stem"] = validation["Tweet_Text"].apply(lambda x: stemming(x))

# vectorize data
vectorizer = TfidfVectorizer()
X_train    = vectorizer.fit_transform(training['Tweet_Text_stem'])
X_validate = vectorizer.fit_transform(validation['Tweet_Text_stem'])

# Select target value
y_train    = training["Sentiment"]
y_validate = validation["Sentiment"]

# Select 10 best features based on chi squared test.
chi_square = SelectKBest(chi2)

X_train    = chi_square.fit_transform(X_train, y_train)
X_validate = chi_square.fit_transform(X_validate, y_validate)

# Apply RandomForestClassifier on features
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)
Predictions_lemma = clf.predict(X_validate)

# Add Results to DataFrame with all outcomes 
results = results.append({"Type"      : "LowerCase X StopWords X Stemming",
                "Precision" : precision_score(y_validate, Predictions_lemma),
                "Recall"    : recall_score(y_validate, Predictions_lemma),
                "F1_score"  : f1_score(y_validate, Predictions_lemma)},
              ignore_index=True)

In [13]:
results

Unnamed: 0,Type,Precision,Recall,F1_score
0,Baseline,0.607595,0.173285,0.269663
1,LowerCase,0.659794,0.231047,0.342246
2,StopWords,0.634146,0.187726,0.289694
3,Stemming,0.637931,0.267148,0.37659
4,LowerCase X Stopwords,0.596154,0.223827,0.325459
5,LowerCase X Stemming,0.637931,0.267148,0.37659
6,Stopwords X Stemming,0.66,0.238267,0.350133
7,LowerCase X StopWords X Stemming,0.66,0.238267,0.350133
