In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
import json

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from nltk import word_tokenize

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score

In [2]:
submissions = []

# The dump is parsed as JSON Lines: every non-empty line is decoded on its own.
# JSON text is UTF-8 by specification, so pin the encoding instead of relying on
# the platform default.
with open('submissions_CasualConversation_all_months_filtered.json', 'r', encoding='utf-8') as save_file:
    # Iterate the file object directly so the whole dump is never held in memory
    # twice (once as raw lines, once as parsed objects).
    for line in save_file:
        # Skip blank lines (e.g. a trailing newline at the end of the dump);
        # json.loads('') would raise otherwise.
        if not line.strip():
            continue
        submissions.append(json.loads(line))

In [3]:
def split_train_test(submissions,train_set_fraction,randomize=True,random_state=None):
    ''' Function for splitting data in training and test set

    Every submission is turned into one text (title, a newline, then the
    selftext) and one label (the link flair text).

    =========================== ===============================================
    Attribute                   Description
    =========================== ===============================================
    "submissions"               Iterable of dicts with the keys 'title',
                                'selftext' and 'link_flair_text'.
    "train_set_fraction"        The fraction of the data that needs to be in
                                the training set. Must lie in [0, 1].
    "randomize"                 Whether the order of the data should be
                                shuffled.
    "random_state"              Optional seed for the shuffle. When None the
                                global NumPy random state is used, so the
                                split depends on np.random.seed.
    =========================== ===============================================

    Returns the tuple (train_set, test_set, train_labels, test_labels), each a
    plain list.

    Raises ValueError when train_set_fraction is outside [0, 1].
    '''
    if not 0 <= train_set_fraction <= 1:
        raise ValueError(
            'train_set_fraction must be between 0 and 1, got %r' % (train_set_fraction,))

    # Keep text and label together so shuffling cannot misalign them.
    data = [
        [submission['title'] + '\n' + submission['selftext'], submission['link_flair_text']]
        for submission in submissions
    ]

    if randomize:
        if random_state is None:
            # Unseeded call: same behaviour as before, driven by the global RNG.
            np.random.shuffle(data)
        else:
            # A private generator gives a repeatable split without touching
            # the global RNG state.
            np.random.RandomState(random_state).shuffle(data)

    # round() on a float returns an int; exact .5 ties round to the even number.
    trainsize = round(len(data) * train_set_fraction)
    train_part, test_part = data[:trainsize], data[trainsize:]

    train_set = [text for text, _ in train_part]
    test_set = [text for text, _ in test_part]
    train_labels = [label for _, label in train_part]
    test_labels = [label for _, label in test_part]
    return train_set, test_set, train_labels, test_labels

In [4]:
# split_train_test shuffles with NumPy's global RNG; seed it here so the
# train/test split (and every score computed from it) is the same on each run.
np.random.seed(42)
train_x, test_x, train_y, test_y = split_train_test(submissions,0.8)

In [5]:
# Training texts and their flair labels as NumPy arrays, the form handed to
# the scikit-learn pipelines below.
x, y = np.array(train_x), np.array(train_y)

In [6]:
# Default parameters shared by the experiments below.
max_features = 2500   # passed as max_features to every TfidfVectorizer
scoring = 'accuracy'  # metric handed to cross_val_score
cv = 3                # cv argument of cross_val_score
n_jobs = 1            # n_jobs argument of cross_val_score

In [7]:
class Stemmer(BaseEstimator):
    ''' Pipeline step that stems every whitespace-separated token of a text.

    Each input text is split on whitespace, every token is lower-cased and
    passed through a PorterStemmer, and the stems are joined back together
    with single spaces.
    '''

    def __init__(self):
        # The stemmer instance lives on the attribute `l`.
        self.l = PorterStemmer()

    def fit(self, x, y=None):
        ''' Nothing is learned from the data; returns self unchanged. '''
        return self

    def _stem_text(self, text):
        ''' Return `text` with every whitespace-separated token stemmed. '''
        stems = []
        for token in text.split():
            stems.append(self.l.stem(token.lower()))
        return ' '.join(stems)

    def transform(self, x):
        ''' Stem each text in the iterable `x` and return them as a NumPy array. '''
        return np.array([self._stem_text(text) for text in x])

In [8]:
# Experiment: stemmed text -> TF-IDF over word bigrams -> logistic regression.
# NOTE(review): the 'english' stop-word list is applied after the stemming step,
# so stemmed forms of stop words may no longer match it - confirm this is intended.
tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=word_tokenize,
    ngram_range=(2, 2),
    stop_words='english',
    max_features=max_features,
)
sm = Stemmer()
lr = LogisticRegression()
p = Pipeline(steps=[('sm', sm), ('tfidf', tfidf), ('lr', lr)])

# Last expression of the cell, so the per-fold scores are shown as its output.
cross_val_score(p, x, y, scoring=scoring, cv=cv, n_jobs=n_jobs, verbose=1)

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   21.4s finished


array([0.75084175, 0.72727273, 0.71348315])

In [49]:
# Experiment: stemmed text -> union of two TF-IDF views -> logistic regression.
sm = Stemmer()

# View 1: word n-grams of length 1 to 5.
tfidf_w = TfidfVectorizer(
    analyzer='word',
    tokenizer=word_tokenize,
    ngram_range=(1, 5),
    stop_words='english',
    max_features=max_features,
)
# View 2: character-level features with the vectorizer's default n-gram range.
tfidf_c = TfidfVectorizer(analyzer='char', max_features=max_features)

lr = LogisticRegression()

# FeatureUnion combines the features of both vectorizers for the classifier.
wc_tfidfs = FeatureUnion([('tfidf_w', tfidf_w), ('tfidf_c', tfidf_c)])
p = Pipeline(steps=[('sm', sm), ('wc_tfidfs', wc_tfidfs), ('lr', lr)])

# Last expression of the cell, so the per-fold scores are shown as its output.
cross_val_score(p, x, y, scoring=scoring, cv=cv, n_jobs=n_jobs, verbose=1)

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   27.1s finished


array([0.90011223, 0.8956229 , 0.89550562])

In [10]:
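# Left disabled: up to this point `p` has only been passed to cross_val_score,
# which (per scikit-learn) fits clones of the estimator, so `p` itself is
# presumably still unfitted here. The pipeline is fitted and used for
# prediction in the cells that follow.
# predictions = p.predict(X=np.array(test_x))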

In [11]:
# Final model: stemmer -> word-bigram TF-IDF -> logistic regression.
# The steps are constructed here rather than taken from the `sm`, `tfidf` and
# `lr` names bound by earlier cells, so the model does not depend on which of
# those cells ran last. The configuration matches the bigram experiment above.
p = Pipeline([
    ('sm', Stemmer()),
    ('tfidf', TfidfVectorizer(max_features=max_features, tokenizer=word_tokenize, ngram_range=(2, 2),
                              analyzer='word', stop_words='english')),
    ('lr', LogisticRegression())
])

In [12]:
# Fit the whole pipeline on the training split; the fitted pipeline is the cell output.
p.fit(X=x, y=y)

Pipeline(memory=None,
     steps=[('sm', Stemmer()), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=2500, min_df=1,
        ngram_range=(2, 2), norm='l2', preprocessor=None,...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [13]:
# Predict a flair label for every text of the held-out test split.
predictions = p.predict(X=np.array(test_x))

In [14]:
# Accuracy of the fitted pipeline on the held-out test split.
# accuracy_score is already imported in the notebook's import cell, so it is
# not imported again here.
y_pred = predictions
y_true = np.array(test_y)
accuracy_score(y_true, y_pred)

0.781437125748503

In [43]:
# Hand-written sample post used as an ad-hoc input for p.predict in the next cell.
# The text (including its irregular spacing and the stray closing quote) is
# runtime data and is kept verbatim.
a = """
I don't like playing  MUSIC song music melody alone. It's very eerie. I can't deal with it, it really freaks me out. The dark, long The near-silent forests. The shadows dancing at the edge of my screen.

It doesn't help my current world has a weird pitch where some of the nearby forests are lit up despite the lack of any light sources.

I can never mute alone for long. When I'm with someone this fear all goes away, of course, but I can't play  alone.

Are there any  that weirdly freak you out?"
"""

In [44]:
# Spot-check the fitted pipeline on two short made-up posts and the sample text `a`.
p.predict(
    np.array([
        "hi this is me as I like dungeons and dragons",
        "What do you guys like watching?",
        a,
    ])
)

array(['Music', 'Movie and Show', 'Music'], dtype='<U14')

In [47]:
from xgboost import XGBClassifier

# Experiment: stemmed text -> TF-IDF over word bigrams -> XGBoost classifier.
# This cell rebinds `tfidf`, `sm` and `p`.
tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=word_tokenize,
    ngram_range=(2, 2),
    stop_words='english',
    max_features=max_features,
)
sm = Stemmer()
xgb = XGBClassifier(n_estimators=300, max_depth=3, learning_rate=0.1)
p = Pipeline(steps=[('sm', sm), ('tfidf', tfidf), ('xgb', xgb)])

# Last expression of the cell, so the per-fold scores are shown as its output.
cross_val_score(p, x, y, scoring=scoring, cv=cv, n_jobs=n_jobs, verbose=1)

  if diff:
  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   34.6s finished


array([0.7620651 , 0.71380471, 0.70561798])