In [1]:
import pandas as pd
import numpy as np
import re
import nltk

import sklearn 
from sklearn import feature_extraction

from sklearn.svm import LinearSVC
from sklearn.linear_model import RidgeClassifier

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /home/mrandl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mrandl/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/mrandl/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
def readFileOfTweets(path):
    ret = None
    with open(path, "r") as f:
        ret = f.read().splitlines()
    return ret

In [3]:
negStrings = readFileOfTweets('data/train_neg_full.txt')
negDf = pd.DataFrame(negStrings, columns=['text'])
negDf['y'] = 0

posStrings = readFileOfTweets('data/train_pos_full.txt')
posDf = pd.DataFrame(posStrings, columns=['text'])
posDf['y'] = 1

In [4]:
allDf = pd.concat([negDf, posDf], ignore_index = True)
allDf

Unnamed: 0,text,y
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...,0
1,glad i dot have taks tomorrow ! ! #thankful #s...,0
2,1-3 vs celtics in the regular season = were fu...,0
3,<user> i could actually kill that girl i'm so ...,0
4,<user> <user> <user> i find that very hard to ...,0
...,...,...
2499995,a warning sign ? (; rt <user> the negativity y...,1
2499996,<user> ff too thank youuu ) ),1
2499997,i just love shumpa ! that's my girl,1
2499998,the best way to start a day ! no matter what h...,1


In [5]:
assert(len(allDf) == len(negDf) + len(posDf))

In [6]:
allDf.sample(5)

Unnamed: 0,text,y
740609,i'm gonna love you through it is probably one ...,0
248296,i never dream when i sleep,0
863914,<user> im so sorry thats awful :/,0
922974,eschatology and messianism in lxx isaiah 1-12 ...,0
1257298,<user> i should start wearing snapbacks init,1


In [7]:
lem = nltk.stem.wordnet.WordNetLemmatizer()
nltk_stopwords = nltk.corpus.stopwords.words("english")

def preprocess_line(text):
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = text.split()
    text = [word for word in text if word not in nltk_stopwords]
    text = [lem.lemmatize(word) for word in text]
    return " ".join(text)

In [8]:
allDf["text_clean"] = allDf["text"].apply(lambda x: preprocess_line(x))
allDf.sample(5)

Unnamed: 0,text,y,text_clean
1176359,dawn ultra dishwashing liquid-original scent -...,0,dawn ultra dishwashing liquidoriginal scent 24...
70570,allahu akbar ! may allah forgive us all ... rt...,0,allahu akbar may allah forgive u rt user gonna...
186592,looking at cooking ( plastic comb these kitche...,0,looking cooking plastic comb kitchentested bud...
1846619,my bitches they know,1,bitch know
1544886,"rt <user> dear fat in my body , find your way ...",1,rt user dear fat body find way boob gtfo


In [9]:
def train_and_pred(x_train, y_train, x_test):
    vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range=(1,2))
    x_vecs = vectorizer.fit_transform(x_train)
    
    classifier = RidgeClassifier()#LinearSVC()
    classifier.fit(x_vecs, y_train)

    return classifier.predict(vectorizer.transform(x_test))

def crossValidate(dataset, fold):
    accuracy = 0
    kf = KFold(n_splits=fold, shuffle = True)
    
    for train_index, test_index in kf.split(dataset):
        train = dataset.iloc[train_index]
        test  = dataset.iloc[test_index]

        train_x = train.text_clean
        train_y = train.y
        test_x  = test.text_clean
        test_y  = test.y
        
        pred = train_and_pred(train_x, train_y, test_x)
        
        accuracy += accuracy_score(test_y, pred, normalize=False)
        print(accuracy_score(test_y, pred))
        
    print("avg fold accuracy : ", accuracy / len(dataset))



In [10]:
crossValidate(allDf, 5)

0.838034
0.839388
0.838614
0.838218
0.83804
avg fold accuracy :  0.8384588
