In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [19]:
# Locations of the labelled training set and the test set.
file_path = '../data/reddit_train.csv'
file_path2 = '../data/reddit_test.csv'

# Load both CSVs; the 'id' column is dropped from the training frame only.
data = pd.read_csv(file_path).drop(columns=['id'])
test_data = pd.read_csv(file_path2)

# Integer-encode every subreddit name (codes follow order of first appearance).
data['sub_id'] = data['subreddits'].factorize()[0]
sub_id_df = (
    data[['subreddits', 'sub_id']]
    .drop_duplicates()
    .sort_values('sub_id')
)
# Lookup tables in both directions: name -> id and id -> name.
sub_to_id = dict(sub_id_df.values)
id_to_sub = dict(sub_id_df[['sub_id', 'subreddits']].values)

# Show the last rows, then the total number of space-separated tokens.
print(data.tail(), '\n')
token_counts = data['comments'].apply(lambda text: len(text.split(' ')))
print(token_counts.sum())

                                                comments       subreddits  \
69995  Thank you, you confirm Spain does have nice pe...           europe   
69996  Imagine how many he would have killed with a r...  leagueoflegends   
69997  Yes. Only. As in the guy I was replying to was...           canada   
69998  Looking for something light-hearted or has a v...            anime   
69999  I love how I never cry about casters because I...  GlobalOffensive   

       sub_id  
69995      16  
69996       2  
69997      17  
69998       6  
69999       9   

2968210


In [20]:
def clean_data(s):
    """Normalise a raw comment into lower-case, space-separated tokens.

    Steps, applied in order:
      1. ``</d>`` / ``</s>`` markers and every character outside
         ``A-Za-z0-9(),!?'``` are replaced by a space.
      2. The contraction suffixes ``'s 've 't 're 'd 'll`` are split from the
         preceding word by a space.
      3. Each of ``, ! ( ) ?`` is surrounded by spaces so it becomes its own
         token.
      4. Runs of whitespace are collapsed to a single space.
      5. Any token containing ``xx`` or ``XX`` is replaced by ``xxx``.
      6. The result is stripped and lower-cased.

    Parameters
    ----------
    s : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned comment.
    """
    # Markup remnants and unsupported characters become separators.
    for expr in (r"</d>", r"</s>", r"[^A-Za-z0-9(),!?\'\`]"):
        s = re.sub(expr, " ", s)

    # Split contractions off the word they are attached to.
    # ("'ll" was previously misspelt with the digits "11" and never matched.)
    for suffix in ("'s", "'ve", "'t", "'re", "'d", "'ll"):
        s = s.replace(suffix, " " + suffix)

    # Pad punctuation with spaces, keeping the punctuation mark itself.
    # A single character class handles every mark individually; the previous
    # per-pattern loop dropped "," and "!" and only matched the pair ")?".
    s = re.sub(r"([,!()?])", r" \1 ", s)

    s = re.sub(r"\s{2,}", " ", s)
    # Collapse any token containing a run of x's into one placeholder.
    s = re.sub(r'\S*(x{2,}|X{2,})\S*', "xxx", s)
    # Defensive: the whitelist above already removed non-ASCII characters.
    s = re.sub(r'[^\x00-\x7F]+', "", s)
    return s.strip().lower()

In [21]:
# Normalise the comment text of the training frame.
data['comments'] = data['comments'].apply(clean_data)
# Clean the test frame from its OWN comments. Reading data['comments'] here
# would overwrite the test text with (already cleaned) training text.
# NOTE(review): assumes reddit_test.csv provides a 'comments' column — confirm.
test_data['comments'] = test_data['comments'].apply(clean_data)

In [22]:
# Hold out 20% of the labelled comments for evaluation; the seed is fixed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data['comments'],
    data['subreddits'],
    test_size=0.2,
    random_state=42,
)

In [23]:
%%time
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
test_counts = count_vect.transform(X_test)
tf_idf_vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2',
                       encoding='latin-1', ngram_range=(1,2), stop_words='english')
features = tf_idf_vectorizer.fit_transform(X_train).toarray()
vectors_test_idf = tf_idf_vectorizer.transform(X_test)
subreddits = data['subreddits'].unique()
labels = data.sub_id[:len(X_train)]
features.shape

CPU times: user 6.59 s, sys: 1.03 s, total: 7.62 s
Wall time: 7.62 s


(56000, 31415)

In [25]:
%%time
from sklearn.feature_selection import chi2

N = 2
for comment, sub_id in sorted(sub_to_id.items()):
    features_chi2 = chi2(features, labels==sub_id)
    indices = np.argsort(features_chi2[0])
    features_names = np.array(tf_idf_vectorizer.get_feature_names())[indices]
    unigrams = [v for v in features_names if len(v.split(' ')) == 1]
    bigrams = [v for v in features_names if len(v.split(' ')) == 2]

CPU times: user 8min 36s, sys: 58.5 s, total: 9min 35s
Wall time: 1min 21s


In [35]:
%%time 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics



from sklearn.model_selection import cross_val_score

rf = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
rf.fit(features, y_train)
rf_y_pred = rf.predict(vectors_test_idf)
print(metrics.accuracy_score(y_test, rf_y_pred))


0.24971428571428572
CPU times: user 49.3 s, sys: 13 s, total: 1min 2s
Wall time: 2min 10s


In [34]:
# Plotting imports live in this cell so that plt.show() below does not depend
# on a pyplot import made elsewhere.
import matplotlib.pyplot as plt
import seaborn as sns

# Candidate classifiers compared with k-fold cross-validation.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0)]
CV = 3  # number of cross-validation folds

# Collect one (model, fold, accuracy) record per fold and build the frame
# once at the end.
entries = []
for model in models:
    model_name = model.__class__.__name__
    print(model_name)  # progress marker: each model can take a while
    # y_train comes from the same train_test_split call as X_train, so it is
    # row-aligned with `features`.
    accuracies = cross_val_score(model, features, y_train, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))

cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# Box plot of the per-fold accuracies with the individual folds overlaid.
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, 
            size=8, jitter=True, edgecolor="gray", linewidth=2)

plt.show()

RandomForestClassifier


KeyboardInterrupt: 