# Import required libraries

In [14]:
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


# Load data and perform preprocess function

In [15]:
data = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1', header=None)
data.rename(columns={0:'target', 1:'id', 2:'date', 3:'flag', 4:'user', 5:'text'}, inplace=True)
data.head(10)

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


In [16]:
data.drop(['id', 'date', 'flag', 'user'], axis=1, inplace=True)
data.head(10)

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
5,0,@Kwesidei not the whole crew
6,0,Need a hug
7,0,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,@Tatiana_K nope they didn't have it
9,0,@twittera que me muera ?


In [17]:
data['target'] = data['target'].replace({0: 'Negative', 2: 'Neutral', 4: 'Positive'})
data = data[data['target'].isin(['Positive', 'Negative'])]
data.reset_index(drop=True, inplace=True)

stop_words = set(stopwords.words('english'))



In [18]:
stemmer = PorterStemmer()
def preprocess(text):
    text = text.lower()
    text = word_tokenize(text)
    text = [stemmer.stem(word) for word in text if word.isalpha()]
    text = [word for word in text if word not in stop_words]
    text = ' '.join(text)
    return text

data['text'] = data['text'].apply(preprocess)

In [23]:
data.head(10)

data.describe()

Unnamed: 0,target,text
count,1600000,1600000.0
unique,2,1529546.0
top,Negative,
freq,800000,2498.0


# Split data into training sets

In [36]:
X = data['text']
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Vectorize the data

In [39]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)

X_train_trans = vectorizer.transform(X_train)
X_test_trans = vectorizer.transform(X_test)

# Train and Evaluate the model

In [40]:
trainer = MultinomialNB()
trainer.fit(X_train_trans, y_train)
y_pred = trainer.predict(X_test_trans)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    Negative       0.74      0.79      0.77    239361
    Positive       0.78      0.73      0.75    240639

    accuracy                           0.76    480000
   macro avg       0.76      0.76      0.76    480000
weighted avg       0.76      0.76      0.76    480000

