In [None]:
### To Do

# try word embeddings using gensim

In [10]:
### Import packages to create absolute file path & make code independent of operating system

from pathlib import Path
import os.path
import sys

import warnings
warnings.filterwarnings("ignore")

### Import packages for data manipulation

import pandas as pd
import numpy as np
import re

### Import packages to visualize data

import matplotlib.pyplot as plt
import seaborn as sns

### Import packages for feature extraction

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from difflib import SequenceMatcher

### Import packages for modeling
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

### Import packages for model selection and performance assessment
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV, train_test_split, KFold, StratifiedKFold, cross_val_score, RandomizedSearchCV, GridSearchCV, learning_curve
from sklearn import metrics
from sklearn.metrics import accuracy_score, log_loss, classification_report, precision_recall_fscore_support
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, mean_squared_error, f1_score

In [11]:
### Read in dataset

print(os.getcwd())

base_path = Path("__file__").parent
full_path = (base_path / "../data/processed/stackoverflow_modeling.csv").resolve()
# Depending on running this in interactive shell vs. terminal, I need to include GitHub/FrauenLoop_NLP_Project_2020 in filepath or not...

stackoverflow = pd.read_csv(os.path.join(full_path))

/Users/HenriekeMax/Documents/Career_Development/GitHub/Predicting-Helpfulness-Of-Stackoverflow-Answers/src/model_training


In [85]:
stackoverflow.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29969 entries, 0 to 29968
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            29969 non-null  int64 
 1   Unnamed: 0.1          29969 non-null  int64 
 2   question_score        29969 non-null  int64 
 3   answer_score          29969 non-null  int64 
 4   question_title        29969 non-null  object
 5   question_text         29969 non-null  object
 6   answer_count          29969 non-null  int64 
 7   comment_count         29969 non-null  int64 
 8   creation_date         29969 non-null  object
 9   tags                  29969 non-null  object
 10  view_count            29969 non-null  int64 
 11  answer_text           29969 non-null  object
 12  score_cat_all         29969 non-null  int64 
 13  question_title_clean  29969 non-null  object
 14  question_text_clean   29969 non-null  object
 15  tags_clean            29316 non-null

In [86]:
### Print out dataset for overview

stackoverflow.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,question_score,answer_score,question_title,question_text,answer_count,comment_count,creation_date,tags,view_count,answer_text,score_cat_all,question_title_clean,question_text_clean,tags_clean,answer_text_clean,tag_list_clean
0,0,0,1,-1,Displaying based upon if NULL is found in Column,<p>I have a data base that I update on a daily...,1,8,2019-10-24 12:57:07.473000+00:00,php|html|sql,52,<p>Rather than checking for NULL you should be...,0,display base upon find column,data base update daily basis attempt create we...,php html sql,rather check check rowdata rowdata print compl...,php html sql
1,1,1,-2,-2,Cannot Update Composer in Command Line,<p>I am updating the composer and it shows up ...,2,6,2019-06-21 08:30:56.677000+00:00,php|laravel|composer-php,450,<p>try following command </p>\n\n<p>install fi...,0,can not update composer command line,update composer show error like load composer ...,php laravel composer php,ptry follow command install first curl use fol...,php laravel composer-php
2,2,2,0,-1,How to swap huge string,<p>I have huge string e.g (This is just the pa...,2,2,2019-07-23 06:39:24.553000+00:00,c#,98,<p>here</p>\n\n<pre><code>public static void M...,0,swap huge string,huge string part string look bit different val...,,phere public static void main string nums var ...,c#
3,3,3,0,-1,Exception is not being caught,<p>Crashlytics reports that the following line...,3,3,2019-06-27 20:02:42.157000+00:00,ios|swift|nsattributedstring|nsmutableattribut...,131,<p>I guess the crash occurs when you try to co...,0,exception catch,crashlytics report follow line sometimes throw...,io swift nsattributedstring nsmutableattribute...,guess crash occur try convert likely get error...,ios swift nsattributedstring nsmutableattribut...
4,4,4,-1,-1,Spring Boot - Do a new WebClient call with res...,<p>I'm trying to call an api with 2 call using...,1,1,2019-06-10 14:15:00.307000+00:00,spring-boot|spring-webclient,41,<p>This is the last code:</p>\n\n<pre><code>pu...,0,spring boot new web client call result another...,im try call api call use webclient first call ...,spring boot spring webclient,last code public void get indici try object ma...,spring-boot spring-webclient


In [12]:
### Determining similarity of question and answer
class Similarity(BaseEstimator, TransformerMixin):

    def __init__(self, df):
        pass
    
    def transform(self, df):
        
        ### Define similarity function
        def similar(column1, column2):
            return SequenceMatcher(None, column1, column2).ratio()
        ### Calculate similarity score between question and answer
        df_new = df[['answer_text_clean', 'question_text_clean']].copy()
        df_new['similarity_score'] = df_new.apply(lambda x: similar(str(x['answer_text_clean']), str(x['question_text_clean'])), axis = 1)
        ### Drop text
        df_new = df_new.drop(columns = ['answer_text_clean', 'question_text_clean'], axis = 1)
        return df_new
    
    def fit(self, df, y=None):
        ### Unless error returns self
        return self

In [13]:
### Check if Similarity score class works as desired

similarity_scorer = Similarity(stackoverflow)
stackoverflow_new = similarity_scorer.transform(stackoverflow)
stackoverflow_new.head(10)

Unnamed: 0,similarity_score
0,0.011077
1,0.015982
2,0.03169
3,0.037935
4,0.006579
5,0.012739
6,0.155268
7,0.011614
8,0.008902
9,0.012146


In [13]:
### Determining similarity of question and answer
class JaccardSimilarity(BaseEstimator, TransformerMixin):

    def __init__(self, df):
        pass
    
    def transform(self, df):
        
        ### Define Jaccard Similarity function
        def get_jaccard_sim(column1, column2): 
            a = set(column1.split()) 
            b = set(column2.split())
            c = a.intersection(b)
            return float(len(c)) / (len(a) + len(b) - len(c))
            
        ### Calculate similarity score between question and answer
        df_new = df[['answer_text_clean', 'question_text_clean']].copy()
        df_new['jaccard_similarity_score'] = df_new.apply(lambda x: get_jaccard_sim(str(x['answer_text_clean']), str(x['question_text_clean'])), axis = 1)
        ### Drop text
        df_new = df_new.drop(columns = ['answer_text_clean', 'question_text_clean'], axis = 1)
        return df_new
    
    def fit(self, df, y=None):
        ### Unless error returns self
        return self

In [42]:
### Check if Jaccard Similarity score class works as desired

jaccard_similarity = JaccardSimilarity(stackoverflow)
stackoverflow_new = jaccard_similarity.transform(stackoverflow)
stackoverflow_new.head(10)

Unnamed: 0,jaccard_similarity_score
0,0.045977
1,0.056604
2,0.142857
3,0.147059
4,0.114035
5,0.180328
6,0.057971
7,0.35
8,0.0625
9,0.122302


In [14]:
### Count number of words in an answer

class WordCounter(BaseEstimator, TransformerMixin):

    def __init__(self, df):
        pass

    def transform(self, df):
        ### Variable name to compute number of words on
        name = df.columns
        ### Make into list
        answer_list = df[name[0]].tolist()
        ### Compute number of words for each answer
        wordcount = [len(re.findall(r'\w+', str(answer))) for answer in answer_list]
        ### Make into a pandas df
        df_new = pd.DataFrame(wordcount)
        ### Add suffix
        df_new = df_new.add_suffix(name)
        return df_new

    def fit(self, df, y=None):
        ### Unless error returns self
        return self

In [16]:
### Check if WordCounter class works as desired

wordcounter = WordCounter(stackoverflow[['answer_text_clean']])
stackoverflow_new = wordcounter.transform(stackoverflow[['answer_text_clean']])
stackoverflow_new.head()

Unnamed: 0,"0Index(['answer_text_clean'], dtype='object')"
0,12
1,20
2,24
3,29
4,234


In [15]:
### Determining whether or not answer contains code

class CodeCheck(BaseEstimator, TransformerMixin):

    def __init__(self, df):
        pass
    
    def transform(self, df):
        ### Check if answer contains code or not
        df_new = df[['answer_text']].copy()
        df_new['code_binary'] = df_new['answer_text'].str.contains('<code>', regex=False)*1      
        ### Drop text
        df_new = df_new.drop(columns = ['answer_text'], axis = 1)
        return df_new
    
    def fit(self, df, y=None):
        ### Unless error returns self
        return self

In [18]:
### Check if CodeCheck class works as desired

codecheck = CodeCheck(stackoverflow) 
stackover_new = codecheck.transform(stackoverflow)

### Check ratio of code vs. no code in answers

stackover_new['code_binary'].value_counts()

1    24849
0     5120
Name: code_binary, dtype: int64

In [16]:
### Determining whether or not answer contains code

class CodeCounter(BaseEstimator, TransformerMixin):

    def __init__(self, df):
        pass
    
    def transform(self, df):
        ### Check if answer contains code or not
        df_new = df[['answer_text']].copy()
        df_new['code_count'] = df_new['answer_text'].str.count('<code>')     
        ### Drop text
        df_new = df_new.drop(columns = ['answer_text'], axis = 1)
        return df_new
    
    def fit(self, df, y=None):
        ### Unless error returns self
        return self

In [21]:
### Check if CodeCounter class works as desired

codecount = CodeCounter(stackoverflow) 
stack_new = codecount.transform(stackoverflow)

stack_new.head()

Unnamed: 0,code_count
0,0
1,2
2,1
3,3
4,2


In [22]:
### Check distribution of code counts

stack_new['code_count'].value_counts().sort_index()

0      5120
1      8040
2      4872
3      3030
4      2092
       ... 
110       1
148       1
152       1
173       1
263       1
Name: code_count, Length: 82, dtype: int64

In [17]:
### Compute n grams from a dataframe for a given variable

class Ngrams(BaseEstimator, TransformerMixin):

    def __init__(self, df):
        pass

    def transform(self, df):
        ### Save name of variable to analyze
        name = df.columns
        #### Initiate TfidfVectorizer
        vectorizer = TfidfVectorizer(strip_accents = 'unicode', use_idf = True, \
                                     stop_words = 'english', analyzer = 'word', \
                                     ngram_range = (1, 2) , max_features = 300)
        ### Fit to data
        X_train = vectorizer.fit_transform(df[name[0]].values.astype(str))

        ### Return sparse matrix
        return X_train
    
    def fit(self, df, y=None):
        ### Unless error returns self
        return self

In [72]:
ngrams = Ngrams(stackoverflow['answer_text_clean'])

stackover_new = ngrams.transform(stackoverflow[['answer_text_clean']])

print(stackover_new)

(0, 201)	0.7397285752112934
  (0, 32)	0.6729053685443962
  (1, 282)	0.2067179708566376
  (1, 132)	0.7638785142438582
  (1, 44)	0.47425068615455956
  (1, 102)	0.3858025177256358
  (2, 14)	0.21995484606469434
  (2, 150)	0.22029297314401286
  (2, 298)	0.22576822480300632
  (2, 49)	0.24375649793696597
  (2, 181)	0.46007810887925377
  (2, 286)	0.44042956240047626
  (2, 259)	0.37474309331415956
  (2, 163)	0.23723621857620783
  (2, 290)	0.2508246398956586
  (2, 252)	0.2668817403603042
  (2, 210)	0.23706541638356413
  (3, 16)	0.34704170178359384
  (3, 82)	0.2538723547060552
  (3, 186)	0.30424019836743954
  (3, 65)	0.2215364428964045
  (3, 87)	0.4965548581815757
  (3, 58)	0.3222252629716368
  (3, 278)	0.4354087519542866
  (3, 150)	0.260417879208496
  :	:
  (29967, 151)	0.022820144639976555
  (29967, 250)	0.040479989078968756
  (29967, 214)	0.022546981908775566
  (29967, 45)	0.024562400945490115
  (29967, 159)	0.40348108681687656
  (29967, 109)	0.03107171828322712
  (29967, 248)	0.13098912526600

In [18]:
### One-hot encode question tags

from sklearn.preprocessing import MultiLabelBinarizer

class TagEncoder(BaseEstimator, TransformerMixin):

    def __init__(self, df):
        pass

    def transform(self, df):
        # One-hot encode list of tags from tag-column
        df_copy = df[['tag_list_clean']].copy()
        mlb = MultiLabelBinarizer()
        df_new = pd.DataFrame(mlb.fit_transform(df_copy.tag_list_clean.values), df.index, mlb.classes_)
        return df_new

    def fit(self, df, y=None):
        ### Unless error returns self
        return self

### fix this using split to get back to full words

In [19]:
tag_encoder = TagEncoder(stackoverflow)

stackover_new = tag_encoder.transform(stackoverflow)

print(stackover_new)

#  +  -  .  0  1  2  3  4  ...  q  r  s  t  u  v  w  x  y  z
0      1  0  0  0  0  0  0  0  0  0  ...  1  0  1  1  0  0  0  0  0  0
1      1  0  0  1  0  0  0  0  0  0  ...  0  1  1  0  0  1  0  0  0  0
2      0  1  0  0  0  0  0  0  0  0  ...  0  0  0  0  0  0  0  0  0  0
3      1  0  0  0  0  0  0  0  0  0  ...  0  1  1  1  1  0  1  0  0  0
4      1  0  0  1  0  0  0  0  0  0  ...  0  1  1  1  0  0  1  0  0  0
...   .. .. .. .. .. .. .. .. .. ..  ... .. .. .. .. .. .. .. .. .. ..
29964  1  0  0  1  0  0  0  0  0  0  ...  0  1  1  1  0  1  0  0  0  0
29965  1  0  0  0  0  0  0  0  0  0  ...  0  1  1  1  1  0  0  0  0  0
29966  1  0  0  1  0  0  0  0  0  0  ...  0  1  1  1  0  1  1  0  0  1
29967  1  0  0  0  0  0  0  0  0  0  ...  0  1  1  1  0  1  0  0  0  0
29968  1  0  0  0  1  0  0  0  0  0  ...  0  1  0  1  0  0  0  0  0  0

[29969 rows x 41 columns]


In [20]:
### One-hot encode top 50 question tags

class TopTagEncoder(BaseEstimator, TransformerMixin):

    def __init__(self, df):
        pass

    def transform(self, df):
        df_new = df[['tag_list_clean']].copy()
        var_list = ['javascript', 'java', 'python', 'c#', 'php', 'android', 'html', 'c++', 'jquery', 'css', 'ios', 'mysql', 'sql', 'asp.net', 'r', 'node.js', 'arrays', 'c', 'ruby-on-rails', '.net', 'json', 'objective-c', 'sql-server', 'swift', 'angularjs', 'python-3.x', 'django', 'reactjs', 'excel', 'regex', 'angular', 'iphone', 'ruby', 'ajax', 'xml', 'linux', 'asp.net-mvc', 'vba', 'spring', 'database', 'wordpress', 'panas', 'wpf', 'string', 'laravel', 'xcode', 'windows', 'mongodb', 'vb.net', 'bash']
        for var in var_list:
            ### Create column name "has_tagname" for each tag in list
            new_var_name = "%s_%s" % ("has", var)
            ### Create dataframe column for each tag, and if original tag-column contains tag, assign 1
            df_new[new_var_name] = df_new['tag_list_clean'].str.contains(re.escape(var), regex=True)*1
        df_new = df_new.drop(columns = ['tag_list_clean'], axis = 1)
        return df_new

    def fit(self, df, y=None):
        ### Unless error returns self
        return self

In [122]:
top_tag_encoder = TopTagEncoder(stackoverflow)

stackover_new_top = top_tag_encoder.transform(stackoverflow)

stackover_new_top.head()

Unnamed: 0,has_javascript,has_java,has_python,has_c#,has_php,has_android,has_html,has_c++,has_jquery,has_css,...,has_wordpress,has_panas,has_wpf,has_string,has_laravel,has_xcode,has_windows,has_mongodb,has_vb.net,has_bash
0,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
val_freq = df[your_column].value_counts() #finds the frequencies of the values and sorts them
good_vals = val_freq[:100].index #takes the top 100 values
df[your_column][~df['your_column'].isin(good_vals)]='Other' #replaces the values not in the top 100 by "Other"

# https://stackoverflow.com/questions/45312377/how-to-one-hot-encode-from-a-pandas-column-containing-a-list/45313942#45313942

In [21]:
### Split into predictors and outcome data

y = stackoverflow['score_cat_all']
X = stackoverflow.drop(['answer_score', 'question_score', 'answer_count', 'comment_count', 'creation_date', 'view_count', 'score_cat_all'] , axis=1)

In [22]:
### Split into train and test data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

In [23]:
### Model selection process: Create list of different classifiers/algorithms to try out

classifiers = [
    LogisticRegression(random_state=1),
    KNeighborsClassifier(),
    SVC(random_state=1),
    DecisionTreeClassifier(random_state=1),
    RandomForestClassifier(random_state=1),
    GradientBoostingClassifier(random_state=1)
    ]

In [127]:
### Model selection process: Loop through the different classifiers using the pipeline

for classifier in classifiers:
    model_pipeline = Pipeline([
        ('feats', FeatureUnion([
            # Ngrams
            ('ngram', Ngrams(X_train[['answer_text_clean']])),
            # Wordcounter
            ('wordcount', WordCounter(X_train[['answer_text_clean']])),
            # Code contained
            ('codecheck', CodeCheck(X_train)),
            # No. of code snippets
            ('codecounter', CodeCounter(X_train)),
            # JaccardSimilarity of question and answer
            ('jaccard', JaccardSimilarity(X_train)),
            # Similarity of question and answer
            ('similarity', Similarity(X_train)),
            # Top tags present
            ('toptags', TopTagEncoder(X_train))
            ])),
            # Classifier
            ('classifier', classifier)])
    model_pipeline.fit(X_train, y_train)
    y_predict = model_pipeline.predict(X_test)
    print(classifier)
    print(metrics.classification_report(y_test, y_predict))

LogisticRegression(random_state=1)
              precision    recall  f1-score   support

           0       0.51      0.60      0.55      3047
           1       0.48      0.32      0.38      3023
           2       0.51      0.57      0.54      2921

    accuracy                           0.50      8991
   macro avg       0.50      0.50      0.49      8991
weighted avg       0.50      0.50      0.49      8991

KNeighborsClassifier()
              precision    recall  f1-score   support

           0       0.46      0.56      0.51      3047
           1       0.40      0.38      0.39      3023
           2       0.46      0.39      0.42      2921

    accuracy                           0.44      8991
   macro avg       0.44      0.44      0.44      8991
weighted avg       0.44      0.44      0.44      8991

SVC(random_state=1)
              precision    recall  f1-score   support

           0       0.49      0.68      0.57      3047
           1       0.43      0.37      0.40      30

In [28]:
model_pipeline = Pipeline([
    ('feats', FeatureUnion([
            # Ngrams
            ('ngram', Ngrams(X_train[['answer_text_clean']])),
            # Wordcounter
            ('wordcount', WordCounter(X_train[['answer_text_clean']])),
            # Code contained
            ('codecheck', CodeCheck(X_train)),
            # No. of code snippets
            ('codecounter', CodeCounter(X_train)),
            # JaccardSimilarity of question and answer
            ('jaccard', JaccardSimilarity(X_train)),
            # Similarity of question and answer
            ('similarity', Similarity(X_train)),
            # Top tags present
            ('toptags', TopTagEncoder(X_train))
            ])),
            # Classifier
            ('classifier', GradientBoostingClassifier(random_state=1))])
model_pipeline.fit(X_train, y_train)
y_predict = model_pipeline.predict(X_test)
print("GradientBoostingClassifier")
print(metrics.classification_report(y_test, y_predict))

GradientBoostingClassifier
              precision    recall  f1-score   support

           0       0.50      0.66      0.57      3047
           1       0.48      0.37      0.42      3023
           2       0.54      0.49      0.52      2921

    accuracy                           0.51      8991
   macro avg       0.51      0.51      0.50      8991
weighted avg       0.51      0.51      0.50      8991



In [29]:
###

y_test.head()

5207     0
10475    1
13742    1
6085     0
13453    1
Name: score_cat_all, dtype: int64

In [30]:
### Mapping predicted scores for test data onto actual scores
 
y_predict = model_pipeline.predict(X_test)
y_predict_df = pd.DataFrame(data = y_predict, columns=['predicted_score_cat'], index = X_test.index.copy())
df_test = pd.merge(X_test, y_predict_df, left_index = True, right_index = True)

df_test['score_cat_all'] = y_test

In [31]:
df_test.head(20)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,question_title,question_text,tags,answer_text,question_title_clean,question_text_clean,tags_clean,answer_text_clean,tag_list_clean,predicted_score_cat,score_cat_all
5207,5213,5213,"Python: how to copy between lists using lists,...","<p>Say we have two lists of values, <code>list...",python|python-3.x|list,"<pre><code>list1 = ['a','b','c','d','e']\nlist...",python copy list use list list index list index,say two list value tow list index copy element...,python python list,pre list list index index for index list list ...,python python-3.x list,0,0
10475,10488,10488,I'm having trouble with the sort order in a kb...,<p>I have a Delphi 10.1 Berlin application tha...,sorting|delphi|indexing,<p>Its because the locale settings defines the...,im trouble sort order kbmmemtable,delphi berlin application use kbm mem table ta...,sort delphi indexing,locale setting defines order try set mtifo ign...,sorting delphi indexing,0,1
13742,13758,13758,CFG: Why is this grammar ambiguous?,<p>The grammar is as follows.</p>\n\n<pre><cod...,grammar|context-free-grammar|context-free-lang...,<p>It isn't ambiguous. Your analysis is corre...,grammar ambiguous,grammar follow gt gt way understand derivation...,grammar context free grammar context free lang...,isnt ambiguous analysis correct heres mechanic...,grammar context-free-grammar context-free-lang...,2,1
6085,6092,6092,How to let Kafka Streams send one record per k...,<p>I'm writing a Kafka Streams App. It does th...,apache-kafka|apache-kafka-streams,<ol>\n<li>I would recommend you to use exactly...,let kafka stream send one record per key per h...,im write kafka stream app follow step consumes...,apache kafka apache kafka stream,ol would recommend use exactly guarantee provi...,apache-kafka apache-kafka-streams,0,0
13453,13468,13468,delete kubernetes cluster on docker-for-deskto...,<p>What is the equivalent command for <code>mi...,docker|kubernetes|kubectl|minikube,<p>In recent Docker Edge versions for Mac ( 2....,delete kubernetes cluster docker desktop,equivalent command docker desktop understand m...,docker kubernetes kubectl minikube,recent docker edge version mac design change r...,docker kubernetes kubectl minikube,2,1
1906,1908,1908,Resource not found errors in Ruby on rails,"<p>I'm very new to RoR, I know that ActiveReco...",ruby-on-rails|http-status-code-404|custom-errors,<p>Missing </p>\n\n<pre><code>def index\n\nend...,resource find error ruby rail,im new ro know active record record find map h...,ruby rail http status code custom error,miss def indexend restart computer everything ...,ruby-on-rails http-status-code-404 custom-errors,0,0
18549,18569,18569,How to prevent black background under control ...,"<p><a href=""https://i.stack.imgur.com/IdW9P.pn...",user-interface|delphi|background,<p>Try this:</p>\n\n<pre><code>interface\n\n ...,prevent black background control repaint,screenshot take window activation see control ...,user interface delphi background,try interface form class form protect procedur...,user-interface delphi background,0,1
27824,27854,27854,How to create a Capture dynamically (Raku),<p>In the following example I try to create a ...,dynamic|capture|raku,<p>In the class <code>List</code> there is the...,create capture dynamically raku,follow example try create capture dynamically ...,dynamic capture raku,class method work exactly want test test test ...,dynamic capture raku,2,2
2996,3000,3000,Custom sort for histogram,<p>After looking at countless questions and an...,python|pandas|matplotlib,<p>Update of your code with the answers in the...,custom sort histogram,look countless question answer custom sort bar...,python panda matplotlib,update code answer comment matplotlib import p...,python pandas matplotlib,0,0
9280,9292,9292,remove quotes from a json file using python,<p>the dataframe 'dataset' is automatically ge...,python,<p>simply u can use eval function.</p>\n\n<pre...,remove quote json file use python,pthe dataframe dataset automatically generated...,python,psimply use eval function new coordinate statu...,python,0,0


In [38]:
df_test.groupby(['score_cat_all', 'predicted_score_cat']).size().unstack(fill_value=0)

predicted_score_cat,0,1,2
score_cat_all,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1994,576,477
1,1140,1023,860
2,893,676,1352


In [42]:
# Examining if correctly predicted bad answers tend to have negative scores
df_test_new = df_test[(df_test['score_cat_all'] == 0) & 
          (df_test['predicted_score_cat'] == 0)] 

df_test_new.head(30)

Unnamed: 0.1,Unnamed: 0,question_title,question_text,tags,answer_text,score_cat_all,question_title_clean,question_text_clean,tags_clean,answer_text_clean,predicted_score_cat
5213,5213,"Python: how to copy between lists using lists,...","<p>Say we have two lists of values, <code>list...",python|python-3.x|list,"<pre><code>list1 = ['a','b','c','d','e']\nlist...",0,python copy list use list list index list index,say two list value tow list index copy element...,python python list,pre list list index index for index list list ...,0
6092,6092,How to let Kafka Streams send one record per k...,<p>I'm writing a Kafka Streams App. It does th...,apache-kafka|apache-kafka-streams,<ol>\n<li>I would recommend you to use exactly...,0,let kafka stream send one record per key per h...,im write kafka stream app follow step consumes...,apache kafka apache kafka stream,ol would recommend use exactly guarantee provi...,0
1908,1908,Resource not found errors in Ruby on rails,"<p>I'm very new to RoR, I know that ActiveReco...",ruby-on-rails|http-status-code-404|custom-errors,<p>Missing </p>\n\n<pre><code>def index\n\nend...,0,resource find error ruby rail,im new ro know active record record find map h...,ruby rail http status code custom error,miss def indexend restart computer everything ...,0
3000,3000,Custom sort for histogram,<p>After looking at countless questions and an...,python|pandas|matplotlib,<p>Update of your code with the answers in the...,0,custom sort histogram,look countless question answer custom sort bar...,python panda matplotlib,update code answer comment matplotlib import p...,0
9292,9292,remove quotes from a json file using python,<p>the dataframe 'dataset' is automatically ge...,python,<p>simply u can use eval function.</p>\n\n<pre...,0,remove quote json file use python,pthe dataframe dataset automatically generated...,python,psimply use eval function new coordinate statu...,0
3789,3789,Reading CSV file - Object Oriented way,<p>I'd like to parse a csv file in my course t...,c#|instantiation,<p>There is really no reason to use a library ...,0,read file object orient way,id like parse csv file course attend cv file l...,instantiation,really reason use library code read simple see...,0
88,88,How to convert survey results as a percentage ...,"<p>A survey was conducted with <code>2,233</co...",python|pandas,<pre><code>df = df.astype(float)\ndf = (df/223...,0,convert survey result percentage total respond...,survey conduct respondent follow result come f...,python panda,pre df df astype float df df round,0
4417,4417,almostIncreasingSequence - Javascript,<p>Hi guys I'm using Codefights concurrently w...,javascript|performance|big-o,"<p>I came up with this solution in TypeScript,...",0,almost increase sequence javascript,hi guy im use codefights concurrently learn al...,javascript performance big,come solution type script put feedback work bo...,0
333,333,I have installed FFMPEG and can't find it insi...,<p>I have installed FFMPeg in CentOS. It works...,php|linux|ffmpeg|centos|ffmpeg-php,"<p>I think to use FFMPFG with php, the fastest...",0,instal cant find inside script,instal peg cent work perfectly inside usr bin ...,php linux ffmpeg centos ffmpeg php,think use php fast way use package http github...,0
4410,4410,Tensorflow Dataset API : Cache,<p>I am trying to use tf.Dataset.cache but it ...,caching|tensorflow|dataset,<ol>\n<li><p>The intention of .cache function ...,0,tensorflow dataset cache,try use tf dataset cache seem affect question ...,cache tensorflow dataset,ol intention cache function speed data pipelin...,0


In [34]:
### Selecting rows from df_test based on condition that predicted and actual score diverge by 2
divergence_df = df_test[(df_test['predicted_score_cat'] - df_test['score_cat_all'] == abs(2))]

divergence_df.head(40)

Unnamed: 0.1,Unnamed: 0,question_title,question_text,tags,answer_text,score_cat_all,question_title_clean,question_text_clean,tags_clean,answer_text_clean,predicted_score_cat,score_cat
1438,1438,Cards stacking over one another with the grid ...,"<p>I have 4 cards using bootstrap 4. However,...",html|css|bootstrap-4,<p>If you remove the <code>col-sm-3</code> fro...,0,card stack one another grid system,card use bootstrap however scale card overlap ...,html cs bootstrap,remove parrent work fine happen force parrent ...,2,0
1543,1543,Flutter : ListView : Scroll parent ListView wh...,<p>I'm using Flutter version 1.12.13+hotfix.</...,flutter|flutter-layout,<p>Usually when I come across an issue like th...,0,flutter list view scroll parent list view chil...,im use flutter version hotfix im look solution...,flutter flutter layout,usually come across issue like use child whate...,2,0
7389,7389,include relative path changes in class (strang...,<p>I don't want to use autoload!</p>\n\n<p>I h...,php,<p>You need read:\n1) Php namespaces\n2) Php a...,0,include relative path change class strange beh...,dont want use autoload class method include fi...,php,need read php namespaces php autoload class im...,2,0
538,538,How to check cache status in ignite,<p>I need to check cache status using apache i...,java|ignite,<p>You can see the interface Ignite</p>\n\n<pr...,0,check cache status ignite,need check cache status use apache ignite chec...,java ignite,see interface ignite check ignite grid active ...,2,0
5232,5232,Remove trailing newline from the elements of a...,<p>I have to take a large list of words in the...,python|list|strip,"<pre><code>my_list = ['this\n', 'is\n', 'a\n',...",0,remove trail newline element string list,take large list word form list word use strip ...,python list strip,pre mylist list word print strip mylist output...,2,0
69,69,Iframe sets wrong width,<p>I have a page having iframe whose width is ...,javascript|jquery|html|css|iframe,"<p>Try replace jquery or add css once, this mi...",0,iframe set wrong width,page iframe whose width assign dynamically jqu...,javascript jquery html cs iframe,try replace jquery add cs might help iframe bo...,2,0
6905,6905,import requests on AWS Lambda for python 3.8,<p>The trick to use <code>requests</code> from...,python|aws-lambda|python-requests|python-3.8,<p>Install the <code>requests</code> dependenc...,0,import request lambda python,trick use botocore vendored long work python l...,python aws lambda python request python,install dependency separately use import impor...,2,0
4347,4347,How do I add a new sourceset to Gradle?,<p>I want to add integration tests to my Gradl...,java|gradle|build|integration-testing|source-sets,"<p>I'm new to Gradle, using Gradle 6.0.1 JUnit...",0,add new sourceset gradle,want add integration test gradle build version...,java gradle build integration test source set,im new gradle use gradle unit here come solve ...,2,0
7260,7260,Having trouble understanding .join(map(re.esca...,<p>I'm having trouble understanding what .join...,python|sorting|escaping,<p>There is nothing weird in <code>reverse=Tru...,0,trouble understand join map escape example,im trouble understand join map escape example ...,python sort escape,nothing weird ry comment change introduce sort...,2,0
6833,6833,sklearn module not found in anaconda,<p>I've been trying to import sklearn but it s...,python|python-3.x|scikit-learn|anaconda,"<p><a href=""https://i.stack.imgur.com/L3yYy.pn...",0,sklearn module find anaconda,ive try import sklearn say module find python ...,python python scikit learn anaconda,although skleran module work well py charm skl...,2,0


In [43]:
divergence_df.groupby(['score_cat_all', 'predicted_score_cat']).size().unstack(fill_value=0)

KeyError: 'score'

In [49]:
### Define the model cross-validation configuration

cv = KFold(n_splits=5, shuffle=True, random_state=1)

In [50]:
cross_val_score(model_pipeline, X_train, y_train, cv=cv)

array([0.47926597, 0.47878932, 0.46520496, 0.4965435 , 0.47175209])

In [156]:
### Create param grid based on results from random grid search

param_grid = {'classifier__n_estimators': [3000],
               'classifier__max_features': ['sqrt'],
               'classifier__max_depth': [None],
               'classifier__min_samples_split': [1, 2],
               'classifier__min_samples_leaf': [1, 2]}

print(param_grid)

{'classifier__n_estimators': [3000], 'classifier__max_features': ['sqrt'], 'classifier__max_depth': [None], 'classifier__min_samples_split': [1, 2], 'classifier__min_samples_leaf': [1, 2]}


In [157]:
### Choose best-performing model to tune using GridSearchCV

grid_classifier = GridSearchCV(model_pipeline, param_grid = param_grid, cv=cv, iid=False, n_jobs=-1, refit = True)
# scoring='roc_auc' --> reincorporate

In [158]:
grid_classifier.fit(X_train, y_train)

print("Best result: %f using parameters %s" % (grid_classifier.best_score_, grid_classifier.best_params_))

Best result: 0.402153 using parameters {'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 3000}


In [159]:
### Assess model performance on test data
print("Model Score assessed on test data: %.3f" % grid_classifier.score(X_test, y_test))

print("Classification Report:", classification_report(y_test, grid_classifier.predict(X_test)))

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html

Model Score assessed on test data: 0.378
Classification Report:               precision    recall  f1-score   support

           0       0.39      0.09      0.14      3055
           1       0.41      0.35      0.38      3013
           2       0.36      0.71      0.48      2929

    accuracy                           0.38      8997
   macro avg       0.39      0.38      0.33      8997
weighted avg       0.39      0.38      0.33      8997



In [151]:
### Define classifier with tuned parameters

classifier = GradientBoostingClassifier(max_depth = None, 
                                        max_features = 'sqrt', 
                                        min_samples_leaf = 1,
                                        min_samples_split = 2, 
                                        n_estimators = 2000, 
                                        random_state = 1)

In [90]:
### Choose best-performing model to tune using random hyperparameter grid

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

### Create random grid
random_grid = {'classifier__n_estimators': n_estimators,
               'classifier__max_features': max_features,
               'classifier__max_depth': max_depth,
               'classifier__min_samples_split': min_samples_split,
               'classifier__min_samples_leaf': min_samples_leaf}

print(random_grid)

{'classifier__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'classifier__max_features': ['auto', 'sqrt'], 'classifier__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'classifier__min_samples_split': [2, 5, 10], 'classifier__min_samples_leaf': [1, 2, 4]}


In [91]:
### Find best combination of parameters using randomized hyperparameter search

random_grid_classifier = RandomizedSearchCV(model_pipeline, param_distributions = random_grid, n_iter = 100, cv = cv, verbose=2, random_state=42, n_jobs = -1)

random_grid_classifier.fit(X_train, y_train)

print(random_grid_classifier.best_params_)

print(random_grid_classifier.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 32.9min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 74.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 166.2min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 239.8min finished
{'classifier__n_estimators': 2000, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 1, 'classifier__max_features': 'sqrt', 'classifier__max_depth': None}
0.37238393071926873


In [None]:
### Fitting pipeline to train data

model_pipeline.fit(X_train, y_train)

### Assess model performance on test data

print("model score: %.3f" % model_pipeline.score(X_test, y_test))

In [None]:
### Pipe different features in with a name so the step can be later called for details

pipeline = Pipeline([
    ('feats', FeatureUnion([
        # Ngrams
        ('ngram_all', Ngrams(X_train[['answer_text_clean']]))
    ])),
     # Classifier
     ('kneighbors', KNeighborsClassifier(n_neighbors=5, leaf_size=40))])

In [None]:
### Cross validation and tuning
from sklearn.model_selection import GridSearchCV
param_grid = {'kneighbors__n_neighbors': (3, 5, 10),
            'kneighbors__leaf_size': (10, 20 , 30),
            'kneighbors__p': (1,2)
            }