In [1]:
import pandas as pd
import numpy as np
import re
import nltk

import sklearn 
from sklearn import feature_extraction

from sklearn.svm import LinearSVC
from sklearn.linear_model import RidgeClassifier

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Download the NLTK resources this notebook relies on: the English stopword
# list and the WordNet data read by the lemmatizer created further down.
# omw-1.4 is fetched alongside WordNet -- presumably the lemmatizer needs it
# in this NLTK version; TODO confirm.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /home/mrandl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mrandl/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/mrandl/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
def readFileOfTweets(path):
    """Read a text file that stores one tweet per line.

    Args:
        path: Location of the file to read.

    Returns:
        list[str]: One entry per line of the file, in file order, with the
        trailing newline removed.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        # Iterate over the file instead of calling str.splitlines(): the
        # latter also breaks on characters such as U+2028, NEL or form feed,
        # which would cut a single tweet into several rows.
        return [line.rstrip("\n") for line in f]

In [3]:
# Build one labelled frame per sentiment class: y = 0 for the negative file,
# y = 1 for the positive file.
negStrings = readFileOfTweets('data/train_neg_full.txt')
negDf = pd.DataFrame({'text': negStrings}).assign(y=0)

posStrings = readFileOfTweets('data/train_pos_full.txt')
posDf = pd.DataFrame({'text': posStrings}).assign(y=1)

In [4]:
# Stack the negative and positive frames into one dataset. ignore_index=True
# renumbers the rows from 0 instead of keeping each frame's own index.
allDf = pd.concat([negDf, posDf], ignore_index = True)
allDf

Unnamed: 0,text,y
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...,0
1,glad i dot have taks tomorrow ! ! #thankful #s...,0
2,1-3 vs celtics in the regular season = were fu...,0
3,<user> i could actually kill that girl i'm so ...,0
4,<user> <user> <user> i find that very hard to ...,0
...,...,...
2499995,a warning sign ? (; rt <user> the negativity y...,1
2499996,<user> ff too thank youuu ) ),1
2499997,i just love shumpa ! that's my girl,1
2499998,the best way to start a day ! no matter what h...,1


In [5]:
# Sanity check: concatenation must neither drop nor duplicate rows.
assert len(negDf) + len(posDf) == len(allDf)

In [6]:
# Spot-check five random rows. No random_state is passed, so the rows shown
# differ from run to run.
allDf.sample(5)

Unnamed: 0,text,y
315336,i will support harry trough everything even if...,0
991831,i need a ride please,0
1731989,it was so nice to see <user> and <user> yester...,1
2033821,"when i'm with you , i'll make every second count",1
753415,"<user> i'm happy that you made it through , bu...",0


In [7]:
lem = nltk.stem.wordnet.WordNetLemmatizer()
nltk_stopwords = nltk.corpus.stopwords.words("english")

# Set view of the stopword list: membership tests against the list itself are
# linear in its length, and preprocess_line performs one per token.
_stopword_set = frozenset(nltk_stopwords)
# Matches every character that is neither a word character nor whitespace.
_non_word_re = re.compile(r'[^\w\s]')

def preprocess_line(text):
    """Normalise one tweet into a space-separated string of lemmatised tokens.

    Steps, in order: convert to ``str``, lower-case, strip surrounding
    whitespace, delete every character that is not a word character or
    whitespace, split on whitespace, drop English stopwords, lemmatise each
    remaining token with WordNet.

    Note that punctuation is removed *before* the stopword filter, so a
    contraction such as "don't" becomes "dont" and is kept, and a placeholder
    such as "<user>" becomes the plain token "user".

    Args:
        text: The raw tweet; any object is accepted and passed through
            ``str()`` first.

    Returns:
        str: The cleaned tokens joined by single spaces (empty string when
        nothing survives the filtering).
    """
    cleaned = _non_word_re.sub('', str(text).lower().strip())
    return " ".join(
        lem.lemmatize(word)
        for word in cleaned.split()
        if word not in _stopword_set
    )

In [8]:
# Store the normalised text in its own column, then eyeball a few random rows.
allDf["text_clean"] = allDf["text"].apply(preprocess_line)
allDf.sample(5)

Unnamed: 0,text,y,text_clean
624240,"<user> is a very lucky girl , hope this is one...",0,user lucky girl hope one relationship last fea...
2161005,fake friends : never ask for food . real frien...,1,fake friend never ask food real friend reason ...
2053000,<user> thnx for the following ! have u heard a...,1,user thnx following u heard singer user amazin...
395677,not only is it my baby birthday but its our an...,0,baby birthday anniversary pain im going
2244487,i understand you're watching the flyers game ....,1,understand youre watching flyer game dont need...


In [9]:
def train_and_pred(x_train, y_train, x_test, alpha=None):
    """Fit a TF-IDF + ridge classifier on the training texts and label x_test.

    The vectorizer uses unigrams and bigrams and is fitted on ``x_train``
    only; ``x_test`` is transformed with that same fitted vocabulary.

    Args:
        x_train: Iterable of training documents (strings).
        y_train: Labels aligned with ``x_train``.
        x_test: Iterable of documents to classify.
        alpha: Regularisation strength for ``RidgeClassifier``. When omitted,
            the module-level ``ridge_param`` is used, which keeps existing
            three-argument calls working unchanged.

    Returns:
        The array of predicted labels for ``x_test``, as returned by
        ``RidgeClassifier.predict``.
    """
    if alpha is None:
        # Backward-compatible fallback: older cells pass the strength by
        # rebinding the global before calling this function.
        alpha = ridge_param

    vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range=(1, 2))
    x_vecs = vectorizer.fit_transform(x_train)

    classifier = RidgeClassifier(alpha=alpha)
    classifier.fit(x_vecs, y_train)

    return classifier.predict(vectorizer.transform(x_test))

def crossValidate(dataset, fold, random_state=42):
    """Run shuffled k-fold cross-validation of ``train_and_pred`` on dataset.

    Prints the accuracy of every fold followed by the overall accuracy
    (correct predictions summed over all folds, divided by the dataset size).

    Args:
        dataset: DataFrame with a ``text_clean`` column (model input) and a
            ``y`` column (labels).
        fold: Number of folds.
        random_state: Seed for the fold shuffling. A fixed default makes runs
            reproducible and ensures that successive calls (for example one
            per hyper-parameter value) are scored on identical splits. Pass
            ``None`` for a different shuffle on every call.

    Returns:
        float: The overall accuracy across all folds.
    """
    kf = KFold(n_splits=fold, shuffle=True, random_state=random_state)
    n_correct = 0

    for train_index, test_index in kf.split(dataset):
        train = dataset.iloc[train_index]
        test = dataset.iloc[test_index]

        pred = train_and_pred(train.text_clean, train.y, test.text_clean)

        # normalize=False yields the number of correct predictions, which
        # serves both the per-fold ratio and the running total.
        fold_correct = accuracy_score(test.y, pred, normalize=False)
        n_correct += fold_correct
        print(float(fold_correct) / len(test))

    avg_accuracy = float(n_correct) / len(dataset)
    print("avg fold accuracy : ", avg_accuracy)
    return avg_accuracy



In [10]:
# Sweep the ridge regularisation strength. train_and_pred reads the
# module-level ridge_param, so it is rebound before every cross-validation run.
ridge_param = 0.0
for alpha in (0.1, 0.2, 1.0, 1.5, 2.0, 2.5, 5.0):
    ridge_param = alpha
    print("=====================================")
    print(f"alpha = {ridge_param}")
    crossValidate(allDf, 5)

alpha = 0.1
0.817018
0.817112
0.817678
0.816764
0.816968
avg fold accuracy :  0.817108
alpha = 0.2
0.825738
0.825002
0.82597
0.824938
0.825198
avg fold accuracy :  0.8253692
alpha = 1.0
0.837918
0.83923
0.83858
0.838572
0.83785
avg fold accuracy :  0.83843
alpha = 1.5
0.839552
0.839136
0.83864
0.840068
0.83951
avg fold accuracy :  0.8393812
alpha = 2.0
0.838954
0.840594
0.838932
0.838682
0.839594
avg fold accuracy :  0.8393512
alpha = 2.5
0.839494
0.839484
0.838518
0.838228
0.839014
avg fold accuracy :  0.8389476
alpha = 5.0
0.836376
0.836496
0.836792
0.836794
0.836526
avg fold accuracy :  0.8365968


In [11]:
def readTestFile(path="data/test_data.txt"):
    """Read the test set and return the tweet text of every record.

    Each record is expected to look like ``<id>,<tweet>``. Only the first
    comma separates the id from the text, so commas inside the tweet are
    preserved. The id itself is discarded.

    Args:
        path: Location of the test file. Defaults to the path this notebook
            has always used.

    Returns:
        list[str]: The tweet texts in file order, without the trailing
        newline.

    Raises:
        ValueError: If a non-blank line contains no comma.
    """
    content = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        for line_no, raw in enumerate(f, start=1):
            line = raw.rstrip("\n")
            if not line.strip():
                # A blank line (typically at the end of the file) carries no
                # record; skip it rather than fail on the missing comma.
                continue
            _, sep, tweet = line.partition(",")
            if not sep:
                raise ValueError(
                    "line %d of %s has no ',' separating id and text"
                    % (line_no, path)
                )
            content.append(tweet)
    return content

def publishResults(test_pred, file_name):
    """Write predictions as a two-column CSV under the ``data/`` directory.

    The file starts with the header ``Id,Prediction`` and then holds one row
    per prediction; ids are assigned sequentially starting at 1, in the order
    the predictions are supplied.

    Args:
        test_pred: Iterable of predictions. Any iterable works, including
            generators; each item is written via ``str()``.
        file_name: Name of the file to create inside ``data/``. An existing
            file of that name is overwritten.
    """
    with open('data/' + file_name, "w", encoding="utf-8") as f:
        f.write("Id,Prediction\n")
        # enumerate supplies the 1-based id without needing len(test_pred).
        for index, pred in enumerate(test_pred, start=1):
            f.write(str(index) + "," + str(pred) + "\n")

In [12]:
# Run the test tweets through the same normalisation as the training text.
test = list(map(preprocess_line, readTestFile()))

In [13]:
# Preview the first five cleaned test tweets.
test[:5]

['sea doo pro sea scooter sport portable seadoo seascootersave air stay longer water url',
 'user shuck well work week cant come cheer oh put battery calculator',
 'cant stay away bug thats baby',
 'user maam lol im perfectly fine contagious anymore lmao',
 'whenever fall asleep watching tv always wake headache']

In [15]:
# alpha = 1.5 had the highest average fold accuracy in the sweep above
# (0.8393812, only marginally ahead of alpha = 2.0 at 0.8393512).
# train_and_pred reads the module-level ridge_param, so it must be set before
# the call. The model is then fitted on the full training set.
ridge_param = 1.5 #keep best from cv
test_pred = train_and_pred(allDf.text_clean, allDf.y, test)

In [16]:
# Convert the classifier's {0, 1} labels to {-1, 1}. Testing for the positive
# class (rather than for 0) keeps this cell idempotent: if it is executed a
# second time, the existing -1 values stay -1 instead of being turned into 1.
test_pred = [1 if x == 1 else -1 for x in test_pred]

In [17]:
# Write the predictions to data/tfidf_ridge.csv.
publishResults(test_pred, "tfidf_ridge.csv")