In [1]:
import pandas as pd
import numpy as np
import re
import nltk

import sklearn 
from sklearn import feature_extraction

from sklearn.svm import LinearSVC
from sklearn.linear_model import RidgeClassifier

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Fetch the NLTK resources used later in the notebook: the stop-word list
# (nltk.corpus.stopwords) and the WordNet data behind WordNetLemmatizer.
# 'omw-1.4' is presumably needed by the installed NLTK's WordNet reader -- confirm.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /home/mrandl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mrandl/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/mrandl/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
def readFileOfTweets(path, encoding="utf-8"):
    """Read a text file and return its lines as a list of strings.

    Args:
        path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        A list with one string per line, line terminators removed. An empty
        file yields an empty list.

    Note:
        ``str.splitlines`` also breaks on other line boundaries such as
        ``\\x0b``, ``\\x0c`` and ``\\u2028``, not only on ``\\n``.
    """
    with open(path, "r", encoding=encoding) as f:
        return f.read().splitlines()

In [3]:
# Read both training files first, then wrap each list of lines in a
# single-column frame and attach its class label (0 = negative, 1 = positive).
negStrings = readFileOfTweets('data/train_neg_full.txt')
posStrings = readFileOfTweets('data/train_pos_full.txt')

negDf = pd.DataFrame(negStrings, columns=['text']).assign(y=0)
posDf = pd.DataFrame(posStrings, columns=['text']).assign(y=1)

In [4]:
# Stack the negative rows followed by the positive rows; ignore_index=True
# discards the per-frame indices and renumbers the rows from 0.
allDf = pd.concat([negDf, posDf], ignore_index = True)
# Last expression of the cell: displays the combined frame.
allDf

Unnamed: 0,text,y
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...,0
1,glad i dot have taks tomorrow ! ! #thankful #s...,0
2,1-3 vs celtics in the regular season = were fu...,0
3,<user> i could actually kill that girl i'm so ...,0
4,<user> <user> <user> i find that very hard to ...,0
...,...,...
2499995,a warning sign ? (; rt <user> the negativity y...,1
2499996,<user> ff too thank youuu ) ),1
2499997,i just love shumpa ! that's my girl,1
2499998,the best way to start a day ! no matter what h...,1


In [5]:
# Sanity check: concatenation must neither drop nor duplicate rows.
assert len(negDf) + len(posDf) == len(allDf)

In [6]:
# Spot-check five randomly chosen rows (no random_state is passed, so the
# rows shown differ from run to run).
allDf.sample(5)

Unnamed: 0,text,y
1260691,<user> gotta go to court at 8: 15 .. lala #bad...,1
90100,<user> i waaanaaa be with stormy waaah . well ...,0
575892,<user> i'll bring my 1936 rickenbacker lap ste...,0
1526095,<user> she sound sexy,1
1057468,<user> it's okay except hyuna has no energy . ...,0


In [7]:
lem = nltk.stem.wordnet.WordNetLemmatizer()
nltk_stopwords = nltk.corpus.stopwords.words("english")

# Set copy of the stop-word list: membership tests run once per token over
# millions of tweets, and a set lookup is constant-time where a list scan is
# linear.  The list above is kept unchanged for any other code that uses it;
# this snapshot is taken once, so later edits to the list are not reflected.
_STOPWORD_SET = frozenset(nltk_stopwords)

# Matches any single character that is neither a word character nor
# whitespace (i.e. punctuation and symbols).  Compiled once, reused per call.
_NON_WORD_RE = re.compile(r'[^\w\s]')

def preprocess_line(text):
    """Normalise one tweet into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. convert to ``str``, lower-case and strip surrounding whitespace;
      2. delete every character that is not a word character or whitespace;
      3. split on whitespace and drop English stop words;
      4. lemmatize each remaining token with the WordNet lemmatizer.

    Note that punctuation is removed *before* the stop-word filter, so a
    contraction such as ``can't`` is filtered (or not) as ``cant``.

    Args:
        text: The raw tweet; any object is accepted and passed through ``str``.

    Returns:
        The cleaned tokens joined by single spaces (``""`` if none remain).
    """
    cleaned = _NON_WORD_RE.sub('', str(text).lower().strip())
    return " ".join(
        lem.lemmatize(word)
        for word in cleaned.split()
        if word not in _STOPWORD_SET
    )

In [8]:
# Clean every tweet and store the result next to the raw text, then show a
# random handful of rows to eyeball the effect of the preprocessing.
allDf["text_clean"] = allDf["text"].apply(preprocess_line)
allDf.sample(5)

Unnamed: 0,text,y,text_clean
1459961,why can't you just tell me how you feel !,1,cant tell feel
1054922,so annoyed that i cant make a pizza in foodtec...,0,annoyed cant make pizza foodtech today f
2167016,<user> you know what i do,1,user know
2334251,soo i can't believe that the lady from atf cal...,1,soo cant believe lady atf called missionary tr...
936888,i was there rt <user> <user> <user> nahhh chil...,0,rt user user user nahhh chill hungry


In [9]:
def train_and_pred(x_train, y_train, x_test, classifier=None, ngram_range=(1, 2)):
    """Fit a TF-IDF + classifier model on the training texts and label ``x_test``.

    Args:
        x_train: Iterable of training documents (strings).
        y_train: Labels aligned with ``x_train``.
        x_test: Iterable of documents to classify.
        classifier: Optional unfitted estimator exposing ``fit(X, y)`` and
            ``predict(X)`` (for example ``RidgeClassifier()``).  When omitted a
            fresh ``LinearSVC()`` is used.  A supplied instance is fitted in
            place.
        ngram_range: ``(min_n, max_n)`` passed to ``TfidfVectorizer``; the
            default uses unigrams and bigrams.

    Returns:
        The array returned by ``classifier.predict`` for ``x_test``.
    """
    if classifier is None:
        classifier = LinearSVC()

    vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range=ngram_range)
    x_vecs = vectorizer.fit_transform(x_train)
    classifier.fit(x_vecs, y_train)

    # transform(), not fit_transform(): the test texts must be encoded with the
    # vocabulary and IDF weights learned from the training texts only.
    return classifier.predict(vectorizer.transform(x_test))

def crossValidate(dataset, fold, random_state=42):
    """Run shuffled k-fold cross-validation and print the accuracies.

    For every fold the model is trained with ``train_and_pred`` on the
    remaining folds and evaluated on the held-out fold.  The accuracy of each
    fold is printed, followed by the overall accuracy (total number of correct
    held-out predictions divided by the number of rows).

    Args:
        dataset: DataFrame with a ``text_clean`` column (model input) and a
            ``y`` column (labels).
        fold: Number of folds, forwarded to ``KFold(n_splits=...)``.
        random_state: Seed for the fold shuffling so that repeated runs use
            the same splits and report the same numbers.  Pass ``None`` for an
            unseeded shuffle that differs on every call.

    Returns:
        None.  Results are printed.
    """
    correct = 0  # running count of correctly classified held-out rows
    kf = KFold(n_splits=fold, shuffle=True, random_state=random_state)

    for train_index, test_index in kf.split(dataset):
        train = dataset.iloc[train_index]
        test = dataset.iloc[test_index]

        pred = train_and_pred(train.text_clean, train.y, test.text_clean)

        # normalize=False yields the number of correct predictions rather than
        # a fraction, so the folds can be pooled exactly below.
        correct += accuracy_score(test.y, pred, normalize=False)
        print(accuracy_score(test.y, pred))

    print("avg fold accuracy : ", correct / len(dataset))



In [10]:
# Estimate accuracy with 5-fold cross-validation over the full labelled set.
crossValidate(allDf, 5)

0.836656
0.837052
0.835902
0.836616
0.836818
avg fold accuracy :  0.8366088


In [11]:
def readTestFile(path="data/test_data.txt", encoding="utf-8"):
    """Read the test file and return the text part of every line.

    Each non-blank line is expected to have the form ``<prefix>,<text>``
    (the prefix is presumably a row id -- confirm against the data file).
    Everything up to and including the first comma is dropped; later commas
    belong to the text and are kept.

    Args:
        path: Location of the test file.  Defaults to the path the notebook
            has always used.
        encoding: Text encoding used to decode the file.

    Returns:
        A list with the text of each non-blank line, in file order.  The
        trailing newline of each line is preserved, as returned by
        ``readlines``.

    Raises:
        IndexError: If a non-blank line contains no comma.
    """
    with open(path, "r", encoding=encoding) as f:
        lines = f.readlines()
    # Blank lines (e.g. a stray trailing newline) carry no tweet and would
    # otherwise fail the [1] lookup, so they are skipped.
    return [line.split(",", 1)[1] for line in lines if line.strip()]

def publishResults(test_pred, file_name):
    """Write predictions as a two-column CSV inside the ``data/`` directory.

    The file starts with the header ``Id,Prediction`` and then holds one row
    per prediction, where ``Id`` counts up from 1 in the order of ``test_pred``.

    Args:
        test_pred: Sequence of predicted labels; each is written via ``str``.
        file_name: Name of the output file, appended to ``'data/'``.
    """
    with open('data/' + file_name, "w") as out:
        out.write("Id,Prediction\n")
        out.writelines(
            str(row_id) + "," + str(label) + "\n"
            for row_id, label in enumerate(test_pred, start=1)
        )

In [12]:
# Run the test tweets through the same cleaning as the training tweets.
test = list(map(preprocess_line, readTestFile()))

In [13]:
# Peek at the first five preprocessed test tweets.
test[:5]

['sea doo pro sea scooter sport portable seadoo seascootersave air stay longer water url',
 'user shuck well work week cant come cheer oh put battery calculator',
 'cant stay away bug thats baby',
 'user maam lol im perfectly fine contagious anymore lmao',
 'whenever fall asleep watching tv always wake headache']

In [14]:
# Train on every labelled tweet (no hold-out) and predict labels for the
# preprocessed test tweets.
test_pred = train_and_pred(allDf.text_clean, allDf.y, test)

In [15]:
# Convert the classifier's 0/1 labels to the -1/1 labels written to the
# submission: 1 stays 1, everything else (the 0 class) becomes -1.
# Keyed on `x == 1` so that re-running this cell leaves the list unchanged;
# keying on `x == 0` would turn every -1 into 1 on a second run because the
# cell overwrites its own input.
test_pred = [1 if x == 1 else -1 for x in test_pred]

In [16]:
# Write the remapped predictions to data/tfidf_svc.csv (Id,Prediction rows).
publishResults(test_pred, "tfidf_svc.csv")