# Preprocessing dataset

In [1]:
import re
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from tqdm import tqdm_notebook as tqdm
from pathlib import Path


def check_file_exist(file_path):
    """Tell whether ``file_path`` points at an existing regular file.

    Args:
        file_path: Path (string or path-like) to test.

    Returns:
        bool: True if the path exists and is a file, False otherwise
        (including when it is a directory or does not exist).
    """
    return Path(file_path).is_file()


def read_data():
    """Load the raw training and test tweet files from ``data_folder``.

    The training file is read into columns ``UID`` and ``Twitter`` and rows
    containing missing values are dropped; the unlabeled test file is read
    into a single ``Twitter`` column.

    Returns:
        tuple: ``(train, test)`` pandas DataFrames.
    """
    train_path = data_folder + "train_tweets.txt"
    test_path = data_folder + "test_tweets_unlabeled.txt"

    labelled = pd.read_fwf(
        train_path,
        infer_nrows=150,
        sep="\t",
        header=None,
        names=["UID", "Twitter"],
    ).dropna()
    unlabelled = pd.read_fwf(
        test_path,
        infer_nrows=150,
        header=None,
        names=["Twitter"],
    )
    return labelled, unlabelled


def regularize(text, tk):
    """Tokenize ``text`` and collapse URL tokens into a ``{domain}`` placeholder.

    Every token containing ``"http"`` has its scheme (``http://`` or
    ``https://``) replaced by ``"{"`` and its path (the first ``/`` followed
    by word characters, dots, slashes or dashes) replaced by ``"}"``, so
    ``http://t.co/abc`` and ``https://t.co/abc`` both become ``{t.co}``.
    A URL without a path keeps only the opening brace (``{example.com``).

    Args:
        text: Raw text to tokenize.
        tk: Tokenizer object exposing ``tokenize(text) -> list[str]``.

    Returns:
        list: The tokens, with URL tokens rewritten in place.
    """
    tokens = tk.tokenize(text)
    for i, token in enumerate(tokens):
        if "http" in token:
            # Accept both schemes; matching only "http://" would leave
            # "https:" behind and mangle secure links into "https:}".
            token = re.sub(r"https?://", "{", token)
            token = re.sub(r"/[\w./-]+", "}", token)
            tokens[i] = token
    return tokens


def stemming(tokens, stemmer):
    """Stem every token and lower-case the result.

    Args:
        tokens: Iterable of token strings.
        stemmer: Object exposing ``stem(token) -> str``.

    Returns:
        list: One lower-cased stem per input token, in input order.
    """
    stems = []
    for word in tokens:
        stem = stemmer.stem(word)
        stems.append(stem.lower())
    return stems


def pre_processing(df, description):
    """Normalize the ``Twitter`` column of ``df`` and cache the result as CSV.

    Each tweet is tokenized and URL-regularized, stemmed, and re-joined into
    one string with ``"\\x01"`` as the token separator. The ``Twitter``
    column of ``df`` is replaced in place and the frame is written to
    ``<data_folder><description>_norm.csv`` without the index.

    Args:
        df: DataFrame holding a ``Twitter`` text column; it is modified.
        description: Label used in the progress message and output file name.

    Returns:
        The same DataFrame object, now holding the normalized text.
    """
    tokenizer = TweetTokenizer()
    porter = PorterStemmer()
    print("Start normalizing {} set".format(description))

    normalized = [
        "\x01".join(stemming(regularize(record["Twitter"], tokenizer), porter))
        for _, record in tqdm(df.iterrows(), total=df.shape[0])
    ]

    # Drop and re-add the column so it ends up holding the joined stems.
    del df["Twitter"]
    df["Twitter"] = normalized

    out_path = data_folder + "{}_norm.csv".format(description)
    df.to_csv(out_path, index=False)
    return df

# Specify data folder
data_folder = "/home/zlp/data/SML/"

# The raw files are always read; they are only consumed when a normalized
# cache below is missing and has to be rebuilt.
train, test = read_data()

# Reuse the cached normalized CSVs when present, otherwise build them
# (pre_processing writes the cache file as a side effect).
if check_file_exist(data_folder + "train_norm.csv"):
    print("train_norm.csv exists. Will load it")
    train_norm = pd.read_csv(data_folder + "train_norm.csv")
else:
    train_norm = pre_processing(train, "train")

if check_file_exist(data_folder + "test_norm.csv"):
    print("test_norm.csv exists. Will load it")
    test_norm = pd.read_csv(data_folder + "test_norm.csv")
else:
    test_norm = pre_processing(test, "test")


tain_norm.txt exists. Will load it
test_norm.txt exists. Will load it


# Vectorize data using BOW

In [2]:
from sklearn.feature_extraction import DictVectorizer

def get_BOW(text):
    """Build a bag-of-words count dictionary.

    Args:
        text: Iterable of hashable tokens.

    Returns:
        dict: Maps each distinct token to its number of occurrences, with
        keys in order of first appearance.
    """
    counts = {}
    for token in text:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    return counts

def prepare_data(feature_extractor):
    """Turn the normalized train/test frames into vectorized feature matrices.

    Reads the module-level ``train_norm`` and ``test_norm`` frames, splits
    each ``Twitter`` string on ``"\\x01"``, and passes the token list to
    ``feature_extractor``. A ``DictVectorizer`` is fitted on the training
    feature dicts and then applied to the test feature dicts.

    Args:
        feature_extractor: Callable mapping a token list to a feature dict.

    Returns:
        tuple: ``(training_data, training_classifications, test_data)``,
        where the labels are the ``UID`` values of the training rows.
    """
    train_features = []
    labels = []
    for _, record in tqdm(train_norm.iterrows(), total=train_norm.shape[0]):
        train_features.append(feature_extractor(record["Twitter"].split("\x01")))
        labels.append(record["UID"])

    test_features = [
        feature_extractor(record["Twitter"].split("\x01"))
        for _, record in tqdm(test_norm.iterrows(), total=test_norm.shape[0])
    ]

    # Fit on the training dicts only, then reuse that fit for the test set.
    bow_vectorizer = DictVectorizer()
    train_matrix = bow_vectorizer.fit_transform(train_features)
    test_matrix = bow_vectorizer.transform(test_features)
    return train_matrix, labels, test_matrix

# Vectorize both sets with bag-of-words counts as features.
trn_data, trn_classes, test_data = prepare_data(get_BOW)

HBox(children=(IntProgress(value=0, max=328931), HTML(value='')))




HBox(children=(IntProgress(value=0, max=35437), HTML(value='')))




In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

# Candidate models. Order matters: other cells pick models by list position.
clfs = [
    KNeighborsClassifier(n_jobs=-1),                         # 0
    DecisionTreeClassifier(),                                # 1
    RandomForestClassifier(n_jobs=-1, n_estimators=50),      # 2
    MultinomialNB(),                                         # 3
    LinearSVC(),                                             # 4
    LogisticRegression(n_jobs=-1),                           # 5
]


# Just for comparing the performance of different models
High memory requirement

In [4]:
from sklearn import model_selection
#from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from time import time

def do_multiple_10foldcrossvalidation(clfs, data, classifications):
    """Run 10-fold cross-validated prediction for each classifier and report.

    For every classifier this prints the estimator, the accuracy, the
    classification report, and the wall-clock time spent on that classifier.

    Args:
        clfs: Iterable of estimators to evaluate.
        data: Feature matrix passed to ``cross_val_predict``.
        classifications: True labels aligned with ``data``.
    """
    for model in clfs:
        started = time()
        predicted = model_selection.cross_val_predict(
            model, data, classifications, cv=10
        )
        print(model)
        print("accuracy")
        print(accuracy_score(classifications, predicted))
        print(classification_report(classifications, predicted))
        elapsed = time() - started
        print("time cost:{}".format(elapsed))

#do_multiple_10foldcrossvalidation(clfs,trn_data,trn_classes)

# train and predict

In [None]:
import pickle


def save_predicted(predicted, index):
    """Write predictions to ``<data_folder>predicted_<index>.csv``.

    The file has an ``Id`` index column numbered from 1 in prediction order
    and a ``Predicted`` column holding the predictions.

    Args:
        predicted: Iterable of predicted labels.
        index: Value inserted into the output file name.
    """
    rows = list(enumerate(predicted, start=1))
    frame = pd.DataFrame(rows, columns=["Id", "Predicted"])
    frame = frame.set_index("Id")
    frame.to_csv(data_folder + "predicted_{}.csv".format(index))

def fit_predict(clfs, indexs, data, classifications):
    """Train the selected classifiers, save their predictions, pickle them.

    For each position in ``indexs`` the classifier ``clfs[i]`` is fitted,
    used to predict the module-level ``test_data``, and dumped to
    ``trained_<i>.pkl`` in the current working directory. The elapsed time
    per classifier is printed.

    Args:
        clfs: Sequence of estimators.
        indexs: Positions in ``clfs`` to train.
        data: Training feature matrix.
        classifications: Training labels aligned with ``data``.
    """
    for idx in indexs:
        started = time()
        model = clfs[idx]
        model.fit(data, classifications)
        save_predicted(model.predict(test_data), idx)
        with open('trained_{}.pkl'.format(idx), 'wb') as out_file:
            pickle.dump(model, out_file, protocol=4)
        print("time cost:{}".format(time() - started))
    
def load_predict(indexs):
    """Reload pickled models and save their predictions on ``test_data``.

    Each model is read from ``trained_<i>.pkl`` in the current working
    directory. Only load pickles you created yourself: unpickling executes
    code embedded in the file.

    Args:
        indexs: Iterable of model indices whose pickles should be loaded.
    """
    for idx in indexs:
        pkl_name = 'trained_{}.pkl'.format(idx)
        with open(pkl_name, 'rb') as in_file:
            restored = pickle.load(in_file)
        save_predicted(restored.predict(test_data), idx)

# Positions in `clfs` to run; range(4, 5) selects only index 4.
models_index = list(range(4, 5))
# Uncomment to retrain and re-pickle instead of loading the saved model:
#fit_predict(clfs, models_index, trn_data, trn_classes)
load_predict(models_index)

time cost:8655.274057388306


  " = {}.".format(effective_n_jobs(self.n_jobs)))
