In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import dill
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score,roc_curve,scorer
from sklearn.metrics import f1_score
#working with text
from sklearn.feature_extraction.text import TfidfVectorizer
#normalizing data
from sklearn.preprocessing import StandardScaler
#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import precision_score,recall_score
#imputer
from sklearn.impute import SimpleImputer

import sklearn.datasets

In [2]:
# Load the balanced sarcasm corpus and preview its first three rows.
sarcasm_csv = "data/sarcasm/train-balanced-sarcasm.csv"
df = pd.read_csv(sarcasm_csv)
df.head(n=3)

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.


In [14]:
# Column dtypes and non-null counts. In the recorded output `comment` has
# fewer non-null entries than there are rows, which is why the comment text
# is imputed before vectorizing.
# NOTE(review): this cell carries execution count 14, i.e. it was run after
# the cells below it — re-run the notebook top to bottom before sharing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010826 entries, 0 to 1010825
Data columns (total 10 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   label           1010826 non-null  int64 
 1   comment         1010773 non-null  object
 2   author          1010826 non-null  object
 3   subreddit       1010826 non-null  object
 4   score           1010826 non-null  int64 
 5   ups             1010826 non-null  int64 
 6   downs           1010826 non-null  int64 
 7   date            1010826 non-null  object
 8   created_utc     1010826 non-null  object
 9   parent_comment  1010826 non-null  object
dtypes: int64(4), object(6)
memory usage: 77.1+ MB


In [3]:
# Split the rows into training and validation parts with a fixed seed
# (test_size is left at the library default). The whole frame, including the
# 'label' column, is passed as X; the feature branches defined later pick
# only their own column out of it.
X_train, X_valid, y_train, y_valid = train_test_split(df, df['label'], random_state=17)

In [4]:
# Write the validation split first, then the training split, to CSV.
# Each destination path is paired with the object that is written to it.
split_outputs = {
    "data/sarcasm/X_valid.csv": X_valid,
    "data/sarcasm/y_valid.csv": y_valid,
    "data/sarcasm/X_train.csv": X_train,
    "data/sarcasm/y_train.csv": y_train,
}
for csv_path, split_part in split_outputs.items():
    split_part.to_csv(csv_path, index=None)

In [5]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that pulls one column out of its input.

    The input is indexed with ``key`` so that the following pipeline steps
    receive just that column instead of the whole data frame.
    """

    def __init__(self, key):
        # Stored under the same name as the constructor argument.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned here; return this instance unchanged."""
        return self

    def transform(self, X):
        """Return ``X[self.key]``."""
        selected = X[self.key]
        return selected
    
class TextImputer(BaseEstimator, TransformerMixin):
    """Replace missing values in one column with a constant.

    Parameters
    ----------
    key
        Label of the column to fill.
    value
        Replacement handed to ``fillna`` for the missing entries.
    """

    def __init__(self, key, value):
        self.key = key
        self.value = value

    def fit(self, X, y=None):
        """Nothing is learned here; return this instance unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ``key`` column has no missing values.

        The input is left untouched. Assigning the filled column straight
        into ``X`` would silently modify the caller's frame (for example the
        training split) every time the pipeline is fitted or used to predict.
        """
        filled = X.copy()
        filled[self.key] = filled[self.key].fillna(self.value)
        return filled

In [6]:
# Names of the input columns and of the label column.
# NOTE(review): neither variable is referenced again in the visible cells;
# the pipelines and the train/validation split spell the column names out.
features = ['comment', 'subreddit', 'parent_comment']
target = 'label'

In [7]:
# Branch for the 'comment' column: fill missing entries with an empty string,
# pick the column out of the frame, then TF-IDF encode the text.
comment = Pipeline(steps=[
    ('imputer', TextImputer(key='comment', value='')),
    ('selector', ColumnSelector('comment')),
    ('tfidf', TfidfVectorizer(min_df=10, max_df=0.9)),
])

In [8]:
%%time

# Fit the comment branch on the training split and display the matrix it
# produces for those same rows.
comment.fit(X_train).transform(X_train)

CPU times: user 14.1 s, sys: 160 ms, total: 14.2 s
Wall time: 14.2 s


<758119x23743 sparse matrix of type '<class 'numpy.float64'>'
	with 6924534 stored elements in Compressed Sparse Row format>

In [9]:
# Build one TF-IDF branch per text column and combine them side by side.
def make_text_branch(column, max_df=0.9, min_df=10):
    """Return the impute -> select -> TF-IDF pipeline for one text column.

    Parameters
    ----------
    column
        Name of the data-frame column the branch works on.
    max_df, min_df
        Document-frequency cut-offs forwarded to ``TfidfVectorizer``.
    """
    return Pipeline([
        ('imputer', TextImputer(column, '')),
        ('selector', ColumnSelector(key=column)),
        ('tfidf', TfidfVectorizer(max_df=max_df, min_df=min_df)),
    ])


# The three branches differ only in the column they read.
comment = make_text_branch('comment')
subreddit = make_text_branch('subreddit')
parent_comment = make_text_branch('parent_comment')

feats = FeatureUnion([('comment', comment),
                      ('subreddit', subreddit),
                      ('parent_comment', parent_comment)])

In [10]:
%%time

# Full model: the combined text features feed a logistic regression that is
# left at its default settings.
model_steps = [
    ('features', feats),
    ('classifier', LogisticRegression()),
]
pipeline = Pipeline(steps=model_steps)

# Train on the training split; the fitted pipeline is the cell's output.
pipeline.fit(X_train, y_train)


CPU times: user 2min, sys: 1min 27s, total: 3min 28s
Wall time: 49.7 s


Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('comment',
                                                 Pipeline(steps=[('imputer',
                                                                  TextImputer(key='comment',
                                                                              value='')),
                                                                 ('selector',
                                                                  ColumnSelector(key='comment')),
                                                                 ('tfidf',
                                                                  TfidfVectorizer(max_df=0.9,
                                                                                  min_df=10))])),
                                                ('subreddit',
                                                 Pipeline(steps=[('imputer',
                                                                  Tex

In [11]:
%%time
# Predicted probabilities for the validation split.
# NOTE(review): `preds` is not used again in the visible cells — no metric is
# computed from it here.
preds = pipeline.predict_proba(X_valid)

CPU times: user 8.15 s, sys: 83.6 ms, total: 8.23 s
Wall time: 8.23 s


In [12]:
# Show the (name, estimator) pairs that make up the fitted pipeline.
pipeline.steps

[('features',
  FeatureUnion(transformer_list=[('comment',
                                  Pipeline(steps=[('imputer',
                                                   TextImputer(key='comment',
                                                               value='')),
                                                  ('selector',
                                                   ColumnSelector(key='comment')),
                                                  ('tfidf',
                                                   TfidfVectorizer(max_df=0.9,
                                                                   min_df=10))])),
                                 ('subreddit',
                                  Pipeline(steps=[('imputer',
                                                   TextImputer(key='subreddit',
                                                               value='')),
                                                  ('selector',
                               

In [13]:
# Serialize the fitted pipeline with dill.
import os

model_path = "models/logreg_pipeline.dill"

# Create the target directory first: open(..., "wb") raises FileNotFoundError
# when it is missing, which would discard the trained model at the last step.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, "wb") as f:
    dill.dump(pipeline, f)