In [1]:
import re

import nltk
import numpy as np
import pandas as pd
import sklearn
from sklearn import feature_extraction
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.svm import LinearSVC

# Fetch the NLTK resources this notebook relies on: the stopword corpus read
# via nltk.corpus.stopwords and the WordNet data behind WordNetLemmatizer.
# NOTE(review): 'omw-1.4' is presumably needed by the WordNet reader of the
# installed NLTK version -- confirm before removing.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /home/mrandl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mrandl/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/mrandl/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
def readFileOfTweets(path, encoding="utf-8"):
    """Read a text file and return its lines as a list of strings.

    Parameters
    ----------
    path : str or path-like
        File to read; each physical line becomes one list entry.
    encoding : str, default "utf-8"
        Encoding used to decode the file. It is passed explicitly so the
        result does not depend on the platform's locale default.

    Returns
    -------
    list of str
        The file's lines in order, without their trailing newline. An empty
        file yields an empty list.
    """
    with open(path, "r", encoding=encoding) as f:
        # Iterate over the file instead of ``read().splitlines()``:
        # ``str.splitlines`` also breaks on characters such as U+2028, U+0085,
        # VT and FF, which would split a single line into several entries.
        # File iteration only splits on real newlines (after universal-newline
        # translation), which matches how ``readlines()`` behaves.
        return [line.rstrip("\n") for line in f]

In [3]:
# Load both halves of the training set; negative tweets get label 0,
# positive tweets get label 1.
negStrings = readFileOfTweets('data/train_neg_full.txt')
posStrings = readFileOfTweets('data/train_pos_full.txt')

negDf = pd.DataFrame(negStrings, columns=['text']).assign(y=0)
posDf = pd.DataFrame(posStrings, columns=['text']).assign(y=1)

In [4]:
# Stack the two labelled frames into one, renumbering rows from 0.
allDf = pd.concat((negDf, posDf), ignore_index=True)
allDf

Unnamed: 0,text,y
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...,0
1,glad i dot have taks tomorrow ! ! #thankful #s...,0
2,1-3 vs celtics in the regular season = were fu...,0
3,<user> i could actually kill that girl i'm so ...,0
4,<user> <user> <user> i find that very hard to ...,0
...,...,...
2499995,a warning sign ? (; rt <user> the negativity y...,1
2499996,<user> ff too thank youuu ) ),1
2499997,i just love shumpa ! that's my girl,1
2499998,the best way to start a day ! no matter what h...,1


In [5]:
# Sanity check: the combined frame has exactly the rows of both inputs.
assert allDf.shape[0] == negDf.shape[0] + posDf.shape[0]

In [6]:
# Spot-check five random rows of the combined frame.
allDf.sample(n=5)

Unnamed: 0,text,y
1637396,<user> i didn't throw the first one,1
2030611,i love waking up next to this boy <url>,1
474022,bruises on my fingers from my guitar <url>,0
1191337,sunday is hbo's new ladies night with ' girls ...,0
2426233,"<user> welcome back to indonesia , greyson : 3...",1


In [7]:
lem = nltk.stem.wordnet.WordNetLemmatizer()
nltk_stopwords = nltk.corpus.stopwords.words("english")

# Set view of the stopwords: the membership test below runs once per token of
# every tweet, so a hash lookup replaces a linear scan of the list.
_stopword_set = frozenset(nltk_stopwords)
# Matches every character that is neither a word character nor whitespace.
_non_word_re = re.compile(r'[^\w\s]')


def preprocess_line(text):
    """Normalise one line of text for vectorisation.

    Steps, in order: convert to ``str``, lowercase, strip surrounding
    whitespace, delete every character that is not a word character or
    whitespace, split on whitespace, drop tokens found in the NLTK English
    stopword list, lemmatize each remaining token with ``lem``, and join the
    tokens back with single spaces.

    Parameters
    ----------
    text : object
        The raw line; anything accepted by ``str()``.

    Returns
    -------
    str
        The cleaned, space-separated tokens (possibly the empty string).
    """
    # NOTE(review): punctuation (including apostrophes) is removed before the
    # stopword filter, so any stopword entry that itself contains an
    # apostrophe can never match a token here. Left unchanged on purpose --
    # confirm whether that is intended before altering the features.
    cleaned = _non_word_re.sub('', str(text).lower().strip())
    return " ".join(
        lem.lemmatize(word)
        for word in cleaned.split()
        if word not in _stopword_set
    )

In [8]:
# Store the normalised text next to the raw text, then spot-check a few rows.
allDf["text_clean"] = allDf["text"].apply(preprocess_line)
allDf.sample(n=5)

Unnamed: 0,text,y,text_clean
650975,<user> i hope they get better poor thing ;,0,user hope get better poor thing
2042936,well that's everyone fed for the night and i'm...,1,well thats everyone fed night im starting look...
1226932,stomach cramps are the worst ..,0,stomach cramp worst
1741259,"every time i see someone check in at "" bed x ""...",1,every time see someone check bed x die little ...
1957042,<user> <user> 7:45 kick off mate . perfect tim...,1,user user 745 kick mate perfect timing j4k ses...


In [11]:
def train_and_pred(x_train, y_train, x_test, random_state=None):
    """Fit a TF-IDF + random-forest pipeline and predict labels for ``x_test``.

    The vectorizer (unigrams and bigrams) is fitted on ``x_train`` only and
    then reused to transform ``x_test``, so no test vocabulary leaks into
    training.

    Parameters
    ----------
    x_train : iterable of str
        Training documents.
    y_train : array-like
        Labels aligned with ``x_train``.
    x_test : iterable of str
        Documents to classify.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``RandomForestClassifier``; pass an int for
        reproducible forests. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        Predicted label for every entry of ``x_test``.
    """
    vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range=(1, 2))
    x_vecs = vectorizer.fit_transform(x_train)

    # RandomForestClassifier comes from sklearn.ensemble (imported at the top
    # of the notebook); n_jobs=-1 uses every available core.
    classifier = RandomForestClassifier(
        n_estimators=1000,
        n_jobs=-1,
        verbose=True,
        random_state=random_state,
    )
    classifier.fit(x_vecs, y_train)

    return classifier.predict(vectorizer.transform(x_test))

def crossValidate(dataset, fold, random_state=None):
    """Run shuffled k-fold cross-validation of ``train_and_pred``.

    For every fold the model is trained on the ``text_clean`` / ``y`` columns
    of the training rows and evaluated on the held-out rows. The accuracy of
    each fold is printed, followed by the overall accuracy.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide the columns ``text_clean`` and ``y``.
    fold : int
        Number of folds.
    random_state : int, RandomState instance or None, default None
        Seed for the fold shuffling; pass an int to get the same splits on
        every run. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    float
        Number of correctly classified held-out rows across all folds divided
        by ``len(dataset)`` (the same value that is printed last).
    """
    correct = 0
    kf = KFold(n_splits=fold, shuffle=True, random_state=random_state)

    for train_index, test_index in kf.split(dataset):
        train = dataset.iloc[train_index]
        test = dataset.iloc[test_index]

        pred = train_and_pred(train.text_clean, train.y, test.text_clean)

        # Count the hits once and derive the fold accuracy from that count
        # instead of scoring the same predictions twice.
        fold_correct = accuracy_score(test.y, pred, normalize=False)
        correct += fold_correct
        print(fold_correct / len(test))

    overall = correct / len(dataset)
    print("avg fold accuracy : ", overall)
    return overall



In [None]:
# 5-fold cross-validation over the full labelled set.
crossValidate(allDf, fold=5)

In [None]:
def readTestFile(path="data/test_data.txt", encoding="utf-8"):
    """Read the test file and return the text part of every ``id,text`` line.

    Each line is split at its first comma only, so commas inside the text are
    preserved. The returned strings keep their trailing newline, exactly as
    read from the file.

    Parameters
    ----------
    path : str or path-like, default "data/test_data.txt"
        Location of the test file.
    encoding : str, default "utf-8"
        Encoding used to decode the file, passed explicitly so the result
        does not depend on the platform's locale default.

    Returns
    -------
    list of str
        Everything after the first comma of each non-blank line, in file
        order.

    Raises
    ------
    ValueError
        If a non-blank line contains no comma.
    """
    content = []
    with open(path, "r", encoding=encoding) as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                # A blank line (typically a trailing one) has no id/text pair.
                continue
            _, sep, tweet = line.partition(",")
            if not sep:
                raise ValueError(
                    "line %d of %s has no ',' separator" % (line_no, path)
                )
            content.append(tweet)
    return content

def publishResults(test_pred, file_name):
    """Write predictions to ``data/<file_name>`` as a two-column CSV.

    The file starts with the header ``Id,Prediction``; every following row
    holds a 1-based running id and the corresponding prediction.
    """
    with open('data/' + file_name, "w") as out:
        out.write("Id,Prediction\n")
        out.writelines(
            f"{row_id!s},{label!s}\n"
            for row_id, label in enumerate(test_pred, start=1)
        )

In [None]:
# Apply the same normalisation to the test tweets as to the training text.
test = list(map(preprocess_line, readTestFile()))

In [None]:
# Peek at the first five cleaned test tweets.
test[0:5]

In [None]:
# Train on the complete labelled set and predict the unlabelled test tweets.
test_pred = train_and_pred(allDf["text_clean"], allDf["y"], test)

In [None]:
# Map the 0/1 training labels to the -1/1 labels written to the submission.
# ``<= 0`` instead of ``== 0`` keeps this cell idempotent: re-running it on
# already remapped values leaves -1 as -1 instead of turning it into 1.
test_pred = [-1 if x <= 0 else 1 for x in test_pred]

In [None]:
# Write the submission file into the data/ directory.
publishResults(test_pred, file_name="tfidf_rft.csv")