# Imports

In [56]:
import re

In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [58]:
import nltk
from nltk.corpus import stopwords

In [59]:
from sklearn.linear_model import LogisticRegression
try:
    # train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
    # the old sklearn.cross_validation module was removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for scikit-learn releases older than 0.18.
    from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [1]:
# Threshold for the random train/test split: a row whose uniform draw in [0, 1)
# is <= this value goes to the training set, the rest go to the test set.
TRAIN_PORTION = 0.9

# Loading Dataset

In [61]:
# Raw string with a single backslash: the r-prefix already keeps "\t" literal,
# so doubling the separator would put two backslashes into the path.
# NOTE(review): this is a machine-specific absolute path; adjust it to where the CSV lives.
dataset_path = r"E:\training.1600000.processed.noemoticon.csv"
# Column names are passed via `names`, so the first line of the file is read as data.
df = pd.read_csv(
    dataset_path,
    encoding="ISO-8859-1",
    names=["target", "ids", "date", "flag", "user", "text"],
)

In [62]:
# Preview the first five rows of the frame as loaded.
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


# Preprocessing the labels

## -1 as negative, 0 as neutral and 1 as positive

In [63]:
# Lookup table from the raw label codes to signed labels.
decode_map = {0: -1, 2: 0, 4: 1}
# Element-wise dictionary lookup; a label missing from the table raises KeyError.
df["target"] = df["target"].apply(decode_map.__getitem__)

In [64]:
# Preview the first five rows to inspect the `target` column.
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,-1,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,-1,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,-1,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,-1,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,-1,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [66]:
# Count how many rows carry each label value.
df.target.value_counts()

-1    800000
 1    800000
Name: target, dtype: int64

# Clean Data

In [67]:
def filter_stopwords(text):
    """Collapse every run of non-alphanumeric characters into a single space.

    Only ASCII letters and digits are kept; anything else (punctuation,
    whitespace, non-ASCII characters) is treated as a separator. Despite the
    name, no stop-word list is consulted here.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The text with each separator run replaced by one space. Leading and
        trailing separators leave a single leading/trailing space.
    """
    separator_run = re.compile('[^A-Za-z0-9]+')
    return separator_run.sub(' ', text)

In [68]:
# Run the character cleanup over every value and store the result back in the same column.
df["text"] = df["text"].apply(filter_stopwords)

In [69]:
# Preview the first five rows to inspect the `text` column.
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,-1,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,switchfoot http twitpic com 2y1zl Awww that s...
1,-1,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can t update his Facebook by ...
2,-1,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,Kenichan I dived many times for the ball Mana...
3,-1,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,-1,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,nationwideclass no it s not behaving at all i...


In [70]:
# Learn the vocabulary and IDF weights from the text column and turn each row
# into a TF-IDF feature vector (default vectorizer settings).
# NOTE(review): the vectorizer is fitted on every row of df.text, including rows
# that are later held out as the test split, so the features are not learned
# from training data alone. Consider fitting on the training rows only.
vectorizer = TfidfVectorizer()
word_frequency = vectorizer.fit_transform(df.text)

In [71]:
# Number of distinct terms learned by the vectorizer. `vocabulary_` maps each
# term to its column index, so its size is the feature count. It is available
# on every scikit-learn release, whereas get_feature_names() was removed in 1.2.
len(vectorizer.vocabulary_)

656463

# Split train and test data

In [72]:
# Seeded generator so the train/test partition (and the reported accuracy) is
# the same on every run; the seed matches the classifier's random_state=0.
split_rng = np.random.RandomState(0)
# One uniform draw in [0, 1) per row decides which side of the split it lands on.
sample_index = split_rng.random_sample(df.shape[0])
train_mask = sample_index <= TRAIN_PORTION
test_mask = ~train_mask
X_train, X_test = word_frequency[train_mask, :], word_frequency[test_mask, :]
Y_train, Y_test = df.target[train_mask], df.target[test_mask]
print(X_train.shape,Y_train.shape)
print(X_test.shape, Y_test.shape)

((1440226, 656463), (1440226L,))
((159774, 656463), (159774L,))


In [73]:
# Configure the classifier first, then train it on the training split.
# fit() returns the estimator itself, so `clf` ends up bound to the fitted model.
clf = LogisticRegression(random_state=0, solver='newton-cg', multi_class='multinomial')
clf = clf.fit(X_train, Y_train)

In [74]:
# Predict a label for every row of the held-out test features.
Y_predit = clf.predict(X_test)

# Accuracy

In [75]:
# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_true=Y_test, y_pred=Y_predit)

0.80298421520397567