In [97]:
import pandas as pd

In [98]:
# Dataset => https://www.kaggle.com/kazanova/sentiment140
# The CSV has no header row, so the six columns are named at read time.
# Only the label and the tweet text are used later in the notebook, so
# `usecols` skips the other four columns instead of loading and then dropping them.
df = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    encoding='latin-1',
    header=None,
    names=['target', 'id', 'date', 'query', 'username', 'content'],
    usecols=['target', 'content'],
)
# Shuffle the rows; the fixed seed makes the order (and everything derived
# from it) reproducible after a kernel restart.
df = df.sample(frac=1, random_state=42)
df.head(5)

In [99]:
# The target column uses 0 for unhappy and 4 for happy.
# Recode 4 -> 1 so the label reads as a plain 0/1 flag.
df["target"] = df["target"].replace(4, 1)
df.head()

In [100]:
# Summary statistics of the remaining columns, computed separately per target label.
df.groupby(by="target").describe()

## Text Preprocessing

In [102]:
# nltk
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
# Stop words are very common words (such as "the", "a", "an", "in").
# Fetch the NLTK stop-word corpus before reading it.
nltk.download("stopwords")

# English Snowball stemmer used by the tweet clean-up step.
stemmer = SnowballStemmer("english")

# NOTE: from here on `stopwords` is a set of English stop words and no longer
# the nltk corpus reader imported above (the name is rebound on purpose so
# later cells can keep referring to `stopwords`).
english_stop_words = stopwords.words("english")
stopwords = set(english_stop_words)
print(stopwords)

In [106]:
# Tweet clean-up. The work is CPU-bound pure Python (string handling plus the
# Snowball stemmer), so a GPU does not help. The previous version wrote every
# row back with `df["content"][i] = ...`; that chained assignment is very slow,
# triggers SettingWithCopyWarning, and does not update `df` at all when pandas
# Copy-on-Write is active. Mapping a function over the column and assigning
# the result once avoids all three problems.
def clean_tweet(tweet):
    """Return `tweet` with noise tokens blanked and the other words stemmed.

    The text is split on single spaces. Tokens that start with "@" (mentions)
    or "http" (links), or that are shorter than two characters, are replaced
    by an empty string; every other token is lower-cased and stemmed with the
    notebook-level `stemmer`. Tokens are re-joined with single spaces, so a
    blanked token leaves its surrounding spaces behind (same output as the
    original row-by-row loop).
    """
    cleaned = []
    for word in tweet.split(" "):
        if word.startswith(("@", "http")) or len(word) < 2:
            cleaned.append("")
        else:
            cleaned.append(stemmer.stem(word.lower()))
    return " ".join(cleaned)


df["content"] = df["content"].map(clean_tweet)
df.head(10)

In [108]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

# Tokenise on runs of ASCII letters/digits.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

# The tweets were stemmed in the clean-up step, so the stop-word list must
# contain the stemmed forms too (e.g. "because" appears in the text as its
# stem); otherwise those words slip past the filter. The raw forms are kept
# as well. `stop_words` has to be a list: recent scikit-learn releases reject
# a set with an InvalidParameterError. Sorting keeps the list deterministic.
stop_word_list = sorted(stopwords | {stemmer.stem(word) for word in stopwords})

cv = CountVectorizer(
    stop_words=stop_word_list,
    ngram_range=(1, 1),
    max_features=300,
    tokenizer=token.tokenize,
    token_pattern=None,  # unused when a custom tokenizer is given; None silences the warning
)
# NOTE(review): the vocabulary is fitted on all rows before the train/test
# split, so the held-out rows influence which 300 terms are kept.
x = cv.fit_transform(df.content)

## Train Test Split

In [109]:
from sklearn.model_selection import train_test_split
# Split with the default train/test proportions. A fixed seed makes the split
# (and therefore every score below) reproducible across kernel restarts, and
# stratifying on the label keeps the class balance equal in both parts.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, df.target, random_state=42, stratify=df.target
)

## Naive Bayes Models

In [110]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the term-count matrix; fit() returns the estimator.
model = MultinomialNB().fit(xtrain, ytrain)

# Score on held-out data first, then on the training data.
mnb_test_score = model.score(xtest, ytest)
mnb_train_score = model.score(xtrain, ytrain)
print("test score :", mnb_test_score)
print("train score :", mnb_train_score)

In [111]:
from sklearn.naive_bayes import ComplementNB

# Complement Naive Bayes, trained and scored on the same split as the other models.
cnb = ComplementNB()
cnb.fit(xtrain, ytrain)

# Report held-out score first, then training score.
for caption, features, labels in (
    ("test score :", xtest, ytest),
    ("train score :", xtrain, ytrain),
):
    print(caption, cnb.score(features, labels))

In [112]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli Naive Bayes with smoothing parameter alpha=2.
BNBmodel = BernoulliNB(alpha=2).fit(xtrain, ytrain)

# Held-out score, then training score.
bnb_scores = {
    "test score :": BNBmodel.score(xtest, ytest),
    "train score :": BNBmodel.score(xtrain, ytrain),
}
for caption, value in bnb_scores.items():
    print(caption, value)

## SVM Model

In [113]:
from sklearn import svm
# The kernel-based svm.SVC scales at least quadratically with the number of
# training samples, which is impractical for a corpus of this size (the file
# name suggests about 1.6M tweets). LinearSVC trains a linear SVM with a
# solver designed for large sample counts. dual=False selects the primal
# formulation, the recommended choice when there are more samples than
# features (here at most 300 features).
SVMmodel = svm.LinearSVC(dual=False)
SVMmodel.fit(xtrain, ytrain)
print("test score :", SVMmodel.score(xtest, ytest))
print("train score :", SVMmodel.score(xtrain, ytrain))

## Decision Tree Model

In [114]:
from  sklearn.tree import DecisionTreeClassifier
# Decision tree on the term counts. scikit-learn permutes the candidate
# features at every split, so without a fixed seed two runs can produce
# different trees and scores; random_state pins the result.
DTmodel = DecisionTreeClassifier(random_state=42)
DTmodel.fit(xtrain, ytrain)
print("test score :", DTmodel.score(xtest, ytest))
print("train score :", DTmodel.score(xtrain, ytrain))

## Logistic Regression Model

In [115]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings on the term-count features.
LRmodel = LogisticRegression().fit(xtrain, ytrain)

# Held-out score, then training score.
lr_test_score = LRmodel.score(xtest, ytest)
lr_train_score = LRmodel.score(xtrain, ytrain)
print("test score :", lr_test_score)
print("train score :", lr_train_score)