In [140]:
### Import packages to create absolute file path & make code independent of operating system

from pathlib import Path
import os.path

import warnings
warnings.filterwarnings("ignore")

### Import packages for data manipulation

import pandas as pd
import numpy as np
import re

### Import packages to visualize data

import matplotlib.pyplot as plt
import seaborn as sns

### Import packages for feature extraction

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer

### Import packages for modeling
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

### Import packages for model selection and performance assessment
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV, train_test_split, KFold, StratifiedKFold, cross_val_score, RandomizedSearchCV, GridSearchCV, learning_curve
from sklearn import metrics
from sklearn.metrics import accuracy_score, log_loss, classification_report, precision_recall_fscore_support
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, mean_squared_error, f1_score

In [141]:
### Read in dataset

print(os.getcwd())

base_path = Path("__file__").parent
full_path = (base_path / "../../data/processed/stackoverflow_preprocessed.csv").resolve()
# Depending on running this in interactive shell vs. terminal, I need to include GitHub/FrauenLoop_NLP_Project_2020 in filepath or not...

stackoverflow = pd.read_csv(os.path.join(full_path))

/Users/HenriekeMax/Documents/Career_Development/GitHub/FrauenLoop_NLP_Project_2020/src/features


In [None]:
stackoverflow = pd.read_csv("/Users/HenriekeMax/Documents/Career_Development/GitHub/FrauenLoop_NLP_Project_2020/data/processed/stackoverflow_preprocessed.csv")

In [144]:
stackoverflow.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29986 entries, 0 to 29999
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            29986 non-null  int64 
 1   score                 29986 non-null  int64 
 2   question_title        29986 non-null  object
 3   question_text         29986 non-null  object
 4   answer_count          29986 non-null  int64 
 5   comment_count         29986 non-null  int64 
 6   creation_date         29986 non-null  object
 7   tags                  29986 non-null  object
 8   view_count            29986 non-null  int64 
 9   answer_text           29986 non-null  object
 10  score_cat             29986 non-null  int64 
 11  question_title_clean  29986 non-null  object
 12  question_text_clean   29986 non-null  object
 13  tags_clean            29694 non-null  object
 14  answer_text_clean     29986 non-null  object
dtypes: int64(6), object(9)
memory usage:

In [143]:
### Drop all observations / rows with any missing values

stackoverflow = stackoverflow.dropna(how='any', subset=['question_title_clean', 'answer_text_clean'])

In [145]:
### Print out dataset for overview

stackoverflow.head()

Unnamed: 0.1,Unnamed: 0,score,question_title,question_text,answer_count,comment_count,creation_date,tags,view_count,answer_text,score_cat,question_title_clean,question_text_clean,tags_clean,answer_text_clean
0,0,0,How to avoid Bot repeating command in groups?,<p>I created a new Telegram Bot which maintain...,2,0,2019-08-23 14:05:29.463000+00:00,telegram-bot|python-telegram-bot,79,"<p>The bot only is replying message, not creat...",0,avoid bot repeat command group,create new telegram bot maintain simple list c...,telegrambot pythontelegrambot,bot reply message create message use sendmessage
1,1,-3,how to host Asp.net core web application..?,<p>I have more confusion with hosting my appli...,2,0,2020-04-27 22:38:03.737000+00:00,asp.net-core|shared-hosting|cloud-hosting,47,<p>If choose their windows cloud hosting platf...,0,host asp net core web application,confusion host application build application a...,asp netcore sharedhosting cloudhosting,choose window cloud host platform net core wor...
2,2,0,discord.ext.commands.errors.MissingRequiredArg...,<p>i would like make a command for set permiss...,2,0,2020-02-21 01:10:53.553000+00:00,discord.py|discord.py-rewrite,103,"<p>What you're doing right now, is <strong>req...",0,discord ext command error missingrequiredargum...,would like make command set permission text ch...,discord py discord pyrewrite,right now require guild argument command bot a...
3,3,0,JPA not saving to DB on WildFly 16,<p>After migrating from Wildfly-8.2.0 to Wildf...,1,1,2019-11-03 23:51:01.683000+00:00,jpa|jakarta-ee|persistence|jpa-2.1|wildfly-16,22,"<p>What was missing is to add the ""eclipselink...",0,jpa save db wildfly,migrate wildfly wildfly jee application launch...,jpa jakartaee persistence jpa wildfly,miss add eclipselink jar file wildfly module s...
4,4,0,Conditionally rendering an array within an arr...,<p>I've tried many of the methods outlined by ...,1,5,2020-02-23 03:24:42.360000+00:00,javascript|html|reactjs,40,<p>You need to trigger a re-render. This is do...,0,conditionally render array within array within...,ive try many method outline user dice im sure ...,javascript html reactjs,need trigger rerender do call tell content com...


In [180]:
### Count number of words in an answer

class WordCounter(BaseEstimator, TransformerMixin):

    def __init__(self, df):
        pass

    def transform(self, df):
        ### Variable name to compute number of words on
        name = df.columns
        ### Make into list
        answer_list = df[name[0]].tolist()
        ### Compute number of words for each answer
        wordcount = [len(re.findall(r'\w+', str(answer))) for answer in answer_list]
        ### Make into a pandas df
        df_new = pd.DataFrame(wordcount)
        ### Add suffix
        df_new = df_new.add_suffix(name)
        return df_new

    def fit(self, df, y=None):
        ### Unless error returns self
        return self

In [181]:
### Check if WordCounter class works as desired

wordcounter = WordCounter(stackoverflow[['answer_text_clean']])
stackoverflow_new = wordcounter.transform(stackoverflow[['answer_text_clean']])
stackoverflow_new.head()

Unnamed: 0,"0Index(['answer_text_clean'], dtype='object')"
0,7
1,22
2,92
3,34
4,82


In [149]:
### Determining whether or not answer contains code

class CodeCheck(BaseEstimator, TransformerMixin):

    def __init__(self, df):
        pass
    
    def transform(self, df):
        ### Check if answer contains code or not
        df_new = df[['answer_text']].copy()
        df_new['code_binary'] = df_new['answer_text'].str.contains('<code>', regex=False)*1      
        ### Drop text
        df_new = df_new.drop(columns = ['answer_text'], axis = 1)
        return df_new
    
    def fit(self, df, y=None):
        ### Unless error returns self
        return self

In [150]:
### Check if CodeCheck class works as desired

codecheck = CodeCheck(stackoverflow) 
stackover_new = codecheck.transform(stackoverflow)

### Check of possible patterns in code existence and answer score

stack_new['code_binary'].value_counts()

1    24144
0     5550
Name: code_binary, dtype: int64

In [151]:
### Determining whether or not answer contains code

class CodeCounter(BaseEstimator, TransformerMixin):

    def __init__(self, df):
        pass
    
    def transform(self, df):
        ### Check if answer contains code or not
        df_new = df[['answer_text']].copy()
        df_new['code_count'] = df_new['answer_text'].str.count('<code>')     
        ### Drop text
        df_new = df_new.drop(columns = ['answer_text'], axis = 1)
        return df_new
    
    def fit(self, df, y=None):
        ### Unless error returns self
        return self

In [152]:
### Check if CodeCheck class works as desired

codecount = CodeCounter(stackoverflow) 
stackover_new = codecount.transform(stackoverflow)

stackover_new.head()

Unnamed: 0,code_count
0,0
1,0
2,4
3,0
4,7


In [153]:
### Check distribution of code counts

stackover_new['code_count'].value_counts().sort_index()

0      5573
1      8795
2      5187
3      3087
4      1994
       ... 
80        1
85        1
88        1
101       1
102       1
Name: code_count, Length: 61, dtype: int64

In [170]:
### Compute n grams from a dataframe for a given variable

class Ngrams(BaseEstimator, TransformerMixin):

    def __init__(self, df):
        pass

    def transform(self, df):
        ### Save name of variable to analyze
        name = df.columns
        #### Initiate TfidfVectorizer
        vectorizer = TfidfVectorizer(strip_accents = 'unicode', use_idf = True, \
                                     stop_words = 'english', analyzer = 'word', \
                                     ngram_range = (1, 1) , max_features = 300)

        ### Fit to data
        X_train = vectorizer.fit_transform(df[name[0]].values.astype(str))
        # X_train = X_train.toarray()
        # is this needed? how do I address mismatching shape problem

        ### Return sparse matrix
        return X_train
    
    def fit(self, df, y=None):
        ### Unless error returns self
        return self

In [171]:
ngrams = Ngrams(stackoverflow['answer_text_clean'])

stackover_new = ngrams.transform(stackoverflow[['answer_text_clean']])

print(stackover_new)

(0, 284)	0.18668056133161562
  (0, 62)	0.32526564334220925
  (0, 171)	0.9270127449404844
  (1, 265)	0.5808322803582754
  (1, 231)	0.45184055316117894
  (1, 298)	0.35818577823349645
  (1, 296)	0.5746102373644851
  (2, 137)	0.1084195698364375
  (2, 89)	0.13026371620718574
  (2, 122)	0.30887861748849305
  (2, 108)	0.0810824222277791
  (2, 195)	0.2614231922161136
  (2, 22)	0.2992936152564825
  (2, 69)	0.24241722218621634
  (2, 239)	0.13587661388280617
  (2, 247)	0.19341495682494572
  (2, 79)	0.12938860119583803
  (2, 294)	0.09225096235418555
  (2, 200)	0.1255211188324691
  (2, 184)	0.09890953079283116
  (2, 262)	0.09015166538768556
  (2, 58)	0.12912467053591078
  (2, 159)	0.09986626258867055
  (2, 3)	0.12765289393425364
  (2, 45)	0.5698260280507296
  :	:
  (29983, 4)	0.1375946298046813
  (29983, 284)	0.08630375651032202
  (29984, 142)	0.2865915490787619
  (29984, 105)	0.1441529020330205
  (29984, 196)	0.2692668444143726
  (29984, 222)	0.1408519561941906
  (29984, 21)	0.15569793420222822
  

In [182]:
### Split into predictors and outcome data

y = stackoverflow['score_cat']
# y = label_binarize(y, classes=[0, 1, 2]) --> to accommodate roc
X = stackoverflow.drop(['score_cat', 'score', 'answer_count', 'comment_count', 'creation_date', 'view_count'] , axis=1)

In [183]:
### Split into train and test data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

In [184]:
### Model selection process: Create list of different classifiers/algorithms to try out

classifiers = [
    KNeighborsClassifier(),
    SVC(random_state=1),
    DecisionTreeClassifier(random_state=1),
    RandomForestClassifier(random_state=1),
    GradientBoostingClassifier(random_state=1)
    ]

In [191]:
### Model selection process: Loop through the different classifiers using the pipeline

for classifier in classifiers:
    model_pipeline = Pipeline([
        ('feats', FeatureUnion([
            # Ngrams
            ('ngram', Ngrams(X_train[['answer_text_clean']]))
            # Wordcounter
            # ('wordcount', WordCounter(X_train[['answer_text_clean']]))
            # Code contained
            # ('codecheck', CodeCheck(X_train)),
            # No. of code snippets
            # ('codecounter', CodeCounter(X_train))
            ])),
            # Classifier
            ('classifier', classifier)])
    model_pipeline.fit(X_train, y_train)
    y_predict = model_pipeline.predict(X_test)
    print(classifier)
    print(metrics.classification_report(y_test, y_predict))

    # map predictions on to dataframe, then create a column if prediction correct and understand false predictions
    # which samples from great answers have been predicted bad
    # which samples from bad answers have been predicted great?

KNeighborsClassifier()
              precision    recall  f1-score   support

           0       0.34      1.00      0.51      3056
           1       0.90      0.03      0.06      3007
           2       0.00      0.00      0.00      2933

    accuracy                           0.35      8996
   macro avg       0.41      0.34      0.19      8996
weighted avg       0.42      0.35      0.19      8996

SVC(random_state=1)
              precision    recall  f1-score   support

           0       0.80      0.03      0.05      3056
           1       0.92      0.03      0.06      3007
           2       0.33      1.00      0.50      2933

    accuracy                           0.34      8996
   macro avg       0.69      0.35      0.20      8996
weighted avg       0.69      0.34      0.20      8996

DecisionTreeClassifier(random_state=1)
              precision    recall  f1-score   support

           0       0.80      0.03      0.05      3056
           1       0.92      0.03      0.06    

In [194]:
### Instantiate classifier

classifier = GradientBoostingClassifier(random_state = 1)

In [195]:
### Define the model cross-validation configuration

cv = KFold(n_splits=5, shuffle=True, random_state=1)

In [90]:
### Choose best-performing model to tune using random hyperparameter grid

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

### Create random grid
random_grid = {'classifier__n_estimators': n_estimators,
               'classifier__max_features': max_features,
               'classifier__max_depth': max_depth,
               'classifier__min_samples_split': min_samples_split,
               'classifier__min_samples_leaf': min_samples_leaf}

print(random_grid)

{'classifier__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'classifier__max_features': ['auto', 'sqrt'], 'classifier__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'classifier__min_samples_split': [2, 5, 10], 'classifier__min_samples_leaf': [1, 2, 4]}


In [91]:
### Find best combination of parameters using randomized hyperparameter search

random_grid_classifier = RandomizedSearchCV(model_pipeline, param_distributions = random_grid, n_iter = 100, cv = cv, verbose=2, random_state=42, n_jobs = -1)

random_grid_classifier.fit(X_train, y_train)

print(random_grid_classifier.best_params_)

print(random_grid_classifier.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 32.9min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 74.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 166.2min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 239.8min finished
{'classifier__n_estimators': 2000, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 1, 'classifier__max_features': 'sqrt', 'classifier__max_depth': None}
0.37238393071926873


In [196]:
model_pipeline = Pipeline([
    ('feats', FeatureUnion([
        # Ngrams
        ('ngram', Ngrams(X_train[['answer_text_clean']]))
        # Wordcounter
        # ('wordcount', WordCounter(X_train[['answer_text_clean']]))
        # Code contained
        # ('codecheck', CodeCheck(X_train)),
        # No. of code snippets
        # ('codecounter', CodeCounter(X_train))
        ])),
        # Classifier
        ('classifier', classifier)])
model_pipeline.fit(X_train, y_train)
y_predict = model_pipeline.predict(X_test)
print(classifier)
print(metrics.classification_report(y_test, y_predict))

GradientBoostingClassifier(random_state=1)
              precision    recall  f1-score   support

           0       0.80      0.03      0.05      3056
           1       0.92      0.03      0.06      3007
           2       0.33      1.00      0.50      2933

    accuracy                           0.34      8996
   macro avg       0.69      0.35      0.20      8996
weighted avg       0.69      0.34      0.20      8996



In [197]:
### Create param grid based on results from random grid search

param_grid = {'classifier__n_estimators': [1000, 2000, 3000, 4000],
               'classifier__max_features': ['sqrt'],
               'classifier__max_depth': [None, 10, 20],
               'classifier__min_samples_split': [1, 2, 3, 4],
               'classifier__min_samples_leaf': [1, 2, 3]}

print(param_grid)

{'classifier__n_estimators': [1000, 2000, 3000, 4000], 'classifier__max_features': ['sqrt'], 'classifier__max_depth': [None, 10, 20], 'classifier__min_samples_split': [1, 2, 3, 4], 'classifier__min_samples_leaf': [1, 2, 3]}


In [202]:
### Choose best-performing model to tune using GridSearchCV

grid_classifier = GridSearchCV(model_pipeline, param_grid = param_grid, cv=cv, iid=False, n_jobs=-1, refit = True)
# scoring='roc_auc' --> reincorporate
grid_classifier.fit(X_train, y_train)

print("Best result: %f using parameters %s" % (grid_classifier.best_score_, grid_classifier.best_params_))

### Assess model performance on test data
print("Model Score assessed on test data: %.3f" % grid_classifier.score(X_test, y_test))

print("Classification Report:", classification_report(y_test, grid_classifier.predict(X_test)))

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html

In [None]:
### Define classifier with tuned parameters and model pipline

classifier = GradientBoostingClassifier(max_depth = 10, 
                                        max_features = 'auto', 
                                        min_samples_leaf = 5,
                                        min_samples_split = 3, 
                                        n_estimators = 200, 
                                        random_state = 1)

In [None]:
### Fitting pipeline to train data

model_pipeline.fit(X_train, y_train)

### Assess model performance on test data

print("model score: %.3f" % model_pipeline.score(X_test, y_test))

In [None]:

## confusion_matrix(y_test, grid_search.predict(X_test))

  # confm_hold = confusion_matrix(y_test, y_predict)
    # print(confm_hold)

# np.array(s)
## confm_hold_df = pd.DataFrame(confm_hold, index = ['No Medal', 'Medal'],
                               # columns = ['No Medal', 'Medal'])
## plt.figure(figsize=(5,4))
## sns.heatmap(confm_hold_df, annot=True, fmt=".4f", linewidths=.5, square = True)

In [None]:
### Pipe different features in with a name so the step can be later called for details

pipeline = Pipeline([
    ('feats', FeatureUnion([
        # Ngrams
        ('ngram_all', Ngrams(X_train[['answer_text_clean']]))
    ])),
     # Classifier
     ('kneighbors', KNeighborsClassifier(n_neighbors=5, leaf_size=40))])

In [None]:
### Cross validation and tuning
from sklearn.model_selection import GridSearchCV
param_grid = {'kneighbors__n_neighbors': (3, 5, 10),
            'kneighbors__leaf_size': (10, 20 , 30),
            'kneighbors__p': (1,2)
            }

In [None]:
### Find best model
grid_search = GridSearchCV(pipeline, param_grid, cv=5, iid=False, n_jobs=-1, refit = True)
grid_search.fit(X_train, y_train)

In [None]:
### Print best model
print("Best score:", grid_search.best_score_)
print("Best params:", grid_search.best_params_)

print(classification_report(y_test, grid_search.predict(X_test)))

In [None]:
### To Do

# Finalize cleaner function (whitespaces etc.)
# Additional features, e.g.
    ### Figure out no. of switches from code to explanation
    ### Extract tags into separate columns and one-hot-encode

# Play with different ngram (1,2,3) and max feature numbers
# Incorporate functions/call them in pipeline
# Try out different models
# Hypertune model


# Add cross validation
# Look at mispredictions to make more targeted features
# Make those features
# Model and you can also try additional types of models