In [None]:
### To Do

# one-hot-encode question tags
# try word embeddings using gensim

In [1]:
### Import packages to create absolute file path & make code independent of operating system

from pathlib import Path
import os.path

import warnings
warnings.filterwarnings("ignore")

### Import packages for data manipulation

import pandas as pd
import numpy as np
import re

### Import packages to visualize data

import matplotlib.pyplot as plt
import seaborn as sns

### Import packages for feature extraction

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from difflib import SequenceMatcher

### Import packages for modeling
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

### Import packages for model selection and performance assessment
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV, train_test_split, KFold, StratifiedKFold, cross_val_score, RandomizedSearchCV, GridSearchCV, learning_curve
from sklearn import metrics
from sklearn.metrics import accuracy_score, log_loss, classification_report, precision_recall_fscore_support
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, mean_squared_error, f1_score

In [2]:
### Read in dataset

print(os.getcwd())

base_path = Path("__file__").parent
full_path = (base_path / "../../data/processed/stackoverflow_preprocessed.csv").resolve()
# Depending on running this in interactive shell vs. terminal, I need to include GitHub/FrauenLoop_NLP_Project_2020 in filepath or not...

stackoverflow = pd.read_csv(os.path.join(full_path))

/Users/HenriekeMax/Documents/Career_Development/GitHub/Predicting-Helpfulness-Of-Stackoverflow-Answers/src/feature_extraction


In [3]:
stackoverflow.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            30000 non-null  int64 
 1   score                 30000 non-null  int64 
 2   question_title        30000 non-null  object
 3   question_text         30000 non-null  object
 4   answer_count          30000 non-null  int64 
 5   comment_count         30000 non-null  int64 
 6   creation_date         30000 non-null  object
 7   tags                  30000 non-null  object
 8   view_count            30000 non-null  int64 
 9   answer_text           30000 non-null  object
 10  score_cat             30000 non-null  int64 
 11  question_title_clean  29982 non-null  object
 12  question_text_clean   29953 non-null  object
 13  tags_clean            29462 non-null  object
 14  answer_text_clean     29341 non-null  object
dtypes: int64(6), object(9)
memory usage:

In [4]:
### Drop all observations / rows with any missing values in the column "answer_text_clean"

stackoverflow = stackoverflow.dropna(how='any', subset=['answer_text_clean'])

In [5]:
### Print out dataset for overview

stackoverflow.head()

Unnamed: 0.1,Unnamed: 0,score,question_title,question_text,answer_count,comment_count,creation_date,tags,view_count,answer_text,score_cat,question_title_clean,question_text_clean,tags_clean,answer_text_clean
0,0,-1,Laravel email template styling,<p>Css is not working in email templates so an...,3,4,2018-01-11 08:00:33.623000+00:00,php|laravel,3230,"<p>Use markdown templates\n<a href=""https://la...",0,laravel email template styling,cs work email template email client look ugly ...,php laravel,use markdown template https laravelcom doc mai...
1,1,-1,Only show a certain count of textboxes in vb.net,<p>I'm trying to make several TextBoxes visibl...,2,2,2020-03-07 19:36:53.527000+00:00,vb.net|textbox,48,<p>I created a list of text boxes and filled i...,0,show certain count textboxes vbnet,im try make several text box visible invisible...,vbnet textbox,create list text box fill form load use list m...
2,2,-1,Is there an easy way to convert a vector int s...,<p>I'm attempting to jump into code again and ...,1,1,2019-09-07 18:45:43.363000+00:00,c++,39,"<p>Like <em>Bob__</em> said, you can use <a hr...",0,easy way convert vector int set string output,im attempt jump code teach ins outs follow cod...,,like bob say use use dropin replacement side n...
3,3,-1,Why am I getting Error while updating the flut...,"<blockquote>\n <p>Waiting for 0 seconds, pres...",1,0,2020-05-29 21:03:49.267000+00:00,flutter,27,<p>Type these commands in your terminal:</p>\n...,0,get error update flutter sdk,wait second press quit check dart version powe...,flutter,type command terminal visit http githubcom flu...
4,4,-1,How to add dynamic buttons using a string in f...,"<pre><code>List list=[""A"",""B"",""C"",""D""];\n</cod...",2,0,2019-07-18 12:12:27.033000+00:00,flutter,302,<p><strong>main.dart</strong> full code</p>\n\...,0,add dynamic button use string flutter,want add char separate raise button generate r...,flutter,maindart full code


In [44]:
### Determining similarity of question and answer
class Similarity(BaseEstimator, TransformerMixin):

    def __init__(self, df):
        pass
    
    def transform(self, df):
        
        ### Define similarity function
        def similar(column1, column2):
            return SequenceMatcher(None, column1, column2).ratio()
        ### Calculate similarity score between question and answer
        df_new = df[['answer_text_clean', 'question_text_clean']].copy()
        df_new['similarity_score'] = df_new.apply(lambda x: similar(str(x['answer_text_clean']), str(x['question_text_clean'])), axis = 1)
        # df_new['similarity_score'] = df_new.apply(lambda x: x.similar(df_new['answer_text_clean'], df_new['question_text_clean']), axis = 1).ratio()
        ### Drop text
        df_new = df_new.drop(columns = ['answer_text_clean', 'question_text_clean'], axis = 1)
        return df_new
    
    def fit(self, df, y=None):
        ### Unless error returns self
        return self

In [46]:
### Check if Similarity score class works as desired

similarity_scorer = Similarity(stackoverflow)
stackoverflow_new = similarity_scorer.transform(stackoverflow)
stackoverflow_new.head(10)

Unnamed: 0,similarity_score
0,0.298592
1,0.174387
2,0.044444
3,0.194093
4,0.185567
5,0.304569
6,0.330827
7,0.039216
8,0.176471
9,0.004751


In [51]:
### Determining similarity of question and answer
class JaccardSimilarity(BaseEstimator, TransformerMixin):

    def __init__(self, df):
        pass
    
    def transform(self, df):
        
        ### Define Jaccard Similarity function
        def get_jaccard_sim(column1, column2): 
            a = set(column1.split()) 
            b = set(column2.split())
            c = a.intersection(b)
            return float(len(c)) / (len(a) + len(b) - len(c))
            
        ### Calculate similarity score between question and answer
        df_new = df[['answer_text_clean', 'question_text_clean']].copy()
        df_new['jaccard_similarity_score'] = df_new.apply(lambda x: get_jaccard_sim(str(x['answer_text_clean']), str(x['question_text_clean'])), axis = 1)
        ### Drop text
        df_new = df_new.drop(columns = ['answer_text_clean', 'question_text_clean'], axis = 1)
        return df_new
    
    def fit(self, df, y=None):
        ### Unless error returns self
        return self

In [52]:
### Check if Jaccard Similarity score class works as desired

jaccard_similarity = JaccardSimilarity(stackoverflow)
stackoverflow_new = jaccard_similarity.transform(stackoverflow)
stackoverflow_new.head(10)

Unnamed: 0,jaccard_similarity_score
0,0.047619
1,0.06383
2,0.048387
3,0.033333
4,0.0
5,0.04
6,0.0
7,0.074468
8,0.035714
9,0.044444


In [167]:
### Count number of words in an answer

class WordCounter(BaseEstimator, TransformerMixin):

    def __init__(self, df):
        pass

    def transform(self, df):
        ### Variable name to compute number of words on
        name = df.columns
        ### Make into list
        answer_list = df[name[0]].tolist()
        ### Compute number of words for each answer
        wordcount = [len(re.findall(r'\w+', str(answer))) for answer in answer_list]
        ### Make into a pandas df
        df_new = pd.DataFrame(wordcount)
        ### Add suffix
        df_new = df_new.add_suffix(name)
        return df_new

    def fit(self, df, y=None):
        ### Unless error returns self
        return self

In [168]:
### Check if WordCounter class works as desired

wordcounter = WordCounter(stackoverflow[['answer_text_clean']])
stackoverflow_new = wordcounter.transform(stackoverflow[['answer_text_clean']])
stackoverflow_new.head()

Unnamed: 0,"0Index(['answer_text_clean'], dtype='object')"
0,41
1,111
2,39
3,11
4,60


In [169]:
### Determining whether or not answer contains code

class CodeCheck(BaseEstimator, TransformerMixin):

    def __init__(self, df):
        pass
    
    def transform(self, df):
        ### Check if answer contains code or not
        df_new = df[['answer_text']].copy()
        df_new['code_binary'] = df_new['answer_text'].str.contains('<code>', regex=False)*1      
        ### Drop text
        df_new = df_new.drop(columns = ['answer_text'], axis = 1)
        return df_new
    
    def fit(self, df, y=None):
        ### Unless error returns self
        return self

In [170]:
### Check if CodeCheck class works as desired

codecheck = CodeCheck(stackoverflow) 
stackover_new = codecheck.transform(stackoverflow)

### Check of possible patterns in code existence and answer score

stackover_new['code_binary'].value_counts()

1    24747
0     5242
Name: code_binary, dtype: int64

In [171]:
### Determining whether or not answer contains code

class CodeCounter(BaseEstimator, TransformerMixin):

    def __init__(self, df):
        pass
    
    def transform(self, df):
        ### Check if answer contains code or not
        df_new = df[['answer_text']].copy()
        df_new['code_count'] = df_new['answer_text'].str.count('<code>')     
        ### Drop text
        df_new = df_new.drop(columns = ['answer_text'], axis = 1)
        return df_new
    
    def fit(self, df, y=None):
        ### Unless error returns self
        return self

In [172]:
### Check if CodeCheck class works as desired

codecount = CodeCounter(stackoverflow) 
stack_new = codecount.transform(stackoverflow)

stack_new.head()

Unnamed: 0,code_count
0,3
1,7
2,3
3,2
4,1


In [173]:
### Check distribution of code counts

stack_new['code_count'].value_counts().sort_index()

0      5242
1      8973
2      5162
3      3168
4      2071
       ... 
82        1
88        1
107       1
118       1
197       1
Name: code_count, Length: 61, dtype: int64

In [174]:
### Compute n grams from a dataframe for a given variable

class Ngrams(BaseEstimator, TransformerMixin):

    def __init__(self, df):
        pass

    def transform(self, df):
        ### Save name of variable to analyze
        name = df.columns
        #### Initiate TfidfVectorizer
        vectorizer = TfidfVectorizer(strip_accents = 'unicode', use_idf = True, \
                                     stop_words = 'english', analyzer = 'word', \
                                     ngram_range = (1, 1) , max_features = 300)

        ### Fit to data
        X_train = vectorizer.fit_transform(df[name[0]].values.astype(str))
        # X_train = X_train.toarray()
        # is this needed? how do I address mismatching shape problem

        ### Return sparse matrix
        return X_train
    
    def fit(self, df, y=None):
        ### Unless error returns self
        return self

In [122]:
ngrams = Ngrams(stackoverflow['answer_text_clean'])

stackover_new = ngrams.transform(stackoverflow[['answer_text_clean']])

print(stackover_new)

(0, 264)	0.21376069455645963
  (0, 67)	0.3496144910716318
  (0, 32)	0.14504142343225496
  (0, 229)	0.11918110815552452
  (0, 28)	0.18551973473862163
  (0, 106)	0.12699088916672527
  (0, 213)	0.16622235933935484
  (0, 253)	0.20887931301012952
  (0, 171)	0.15155337337365532
  (0, 198)	0.6775235290901117
  (0, 291)	0.3849932484599706
  (0, 60)	0.14016542589463377
  (0, 284)	0.15861121826221056
  (1, 81)	0.12786517017197907
  (1, 279)	0.09146996717578719
  (1, 159)	0.09168507146574839
  (1, 112)	0.06947764321606675
  (1, 135)	0.2579355449519641
  (1, 80)	0.2880733989757071
  (1, 113)	0.2361710738360713
  (1, 184)	0.17775583702049533
  (1, 177)	0.07470821251915367
  (1, 206)	0.33375488597052205
  (1, 257)	0.11515758590075796
  (1, 158)	0.10952496026997699
  :	:
  (29987, 287)	0.030738032401505147
  (29987, 204)	0.19556369744690183
  (29987, 37)	0.017654860179919112
  (29987, 125)	0.075570871606756
  (29987, 41)	0.014808459832736362
  (29987, 163)	0.08753459644622182
  (29987, 179)	0.0210317

In [175]:
### Split into predictors and outcome data

y = stackoverflow['score_cat']
# y = label_binarize(y, classes=[0, 1, 2]) --> to accommodate roc
X = stackoverflow.drop(['score_cat', 'answer_count', 'comment_count', 'creation_date', 'view_count'] , axis=1)

In [176]:
### Split into train and test data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

In [177]:
### Model selection process: Create list of different classifiers/algorithms to try out

classifiers = [
    KNeighborsClassifier(),
    SVC(random_state=1),
    DecisionTreeClassifier(random_state=1),
    RandomForestClassifier(random_state=1),
    GradientBoostingClassifier(random_state=1)
    ]

In [139]:
### Model selection process: Loop through the different classifiers using the pipeline

for classifier in classifiers:
    model_pipeline = Pipeline([
        ('feats', FeatureUnion([
            # Ngrams
            ('ngram', Ngrams(X_train[['answer_text_clean']])),
            # Wordcounter
            ('wordcount', WordCounter(X_train[['answer_text_clean']])),
            # Code contained
            ('codecheck', CodeCheck(X_train)),
            # No. of code snippets
            ('codecounter', CodeCounter(X_train))
            ])),
            # Classifier
            ('classifier', classifier)])
    model_pipeline.fit(X_train, y_train)
    y_predict = model_pipeline.predict(X_test)
    print(classifier)
    print(metrics.classification_report(y_test, y_predict))

KNeighborsClassifier()
              precision    recall  f1-score   support

           0       0.34      0.59      0.43      3055
           1       0.39      0.17      0.24      3013
           2       0.35      0.30      0.32      2929

    accuracy                           0.35      8997
   macro avg       0.36      0.35      0.33      8997
weighted avg       0.36      0.35      0.33      8997

SVC(random_state=1)
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      3055
           1       0.39      0.41      0.40      3013
           2       0.36      0.71      0.47      2929

    accuracy                           0.37      8997
   macro avg       0.25      0.37      0.29      8997
weighted avg       0.25      0.37      0.29      8997

DecisionTreeClassifier(random_state=1)
              precision    recall  f1-score   support

           0       0.35      0.07      0.12      3055
           1       0.39      0.34      0.37    

In [138]:
model_pipeline = Pipeline([
    ('feats', FeatureUnion([
        # Ngrams answers
        ('ngram_ans', Ngrams(X_train[['answer_text_clean']])),
        # Wordcounter
        ('wordcount', WordCounter(X_train[['answer_text_clean']])),
        # Code contained
        ('codecheck', CodeCheck(X_train)),
        # No. of code snippets
        ('codecounter', CodeCounter(X_train))
        ])),
        # Classifier
        ('classifier', GradientBoostingClassifier(random_state = 1))])
model_pipeline.fit(X_train, y_train)
y_predict = model_pipeline.predict(X_test)
print(classifier)
print(metrics.classification_report(y_test, y_predict))

GradientBoostingClassifier(random_state=1)
              precision    recall  f1-score   support

           0       0.78      0.03      0.05      3055
           1       0.40      0.43      0.41      3013
           2       0.36      0.71      0.48      2929

    accuracy                           0.38      8997
   macro avg       0.52      0.39      0.31      8997
weighted avg       0.52      0.38      0.31      8997



In [140]:
###

y_test.head()

11108    1
1125     0
2242     0
14480    1
9216     0
Name: score_cat, dtype: int64

In [141]:
### Mapping predicted scores for test data onto actual scores
 
y_predict = model_pipeline.predict(X_test)
y_predict_df = pd.DataFrame(data = y_predict, columns=['predicted_score_cat'], index = X_test.index.copy())
df_test = pd.merge(X_test, y_predict_df, left_index = True, right_index = True)

df_test['score_cat'] = y_test

In [143]:
df_test.head(20)

Unnamed: 0.1,Unnamed: 0,score,question_title,question_text,tags,answer_text,question_title_clean,question_text_clean,tags_clean,answer_text_clean,predicted_score_cat,score_cat
11108,11108,1,No matches for kind ClusterIssuer on a Digital...,"<p>I have been following <a href=""https://www....",kubernetes|digital-ocean|lets-encrypt|kubectl,"<p>Try following <a href=""https://docs.cert-ma...",match kind clusterissuer digital ocean kuberne...,follow guide create nginxingress work fine nex...,kubernetes digitalocean letsencrypt kubectl,try follow link certmanager letsencrypt notify...,2,1
1125,1125,-2,How to do a slideshow in HTML CSS,<p>I want to do a slideshow like this one : <a...,javascript|html|css|slideshow,"<p>I recommend you to use the <a href=""https:/...",slideshow html cs,want slideshow like one httpswww inextenso dig...,javascript html cs slideshow,recommend use slick library work jquery allow ...,2,0
2242,2242,-1,Android: Reading json from sampledata folder,<p>I would like to fill <code>RecyclerView</co...,java|android|json|kotlin,<p>You should do step by step as below or watc...,android read json sampledata folder,would like fill custom file stay sampledata fo...,java android json kotlin,step step watch tutorial use kotlin language c...,1,0
14480,14480,1,MouseX for a wave,<p>I'm new in Processing and I have a kind of ...,processing,"<p><a href=""https://processing.org/reference/l...",mousex wave,im new processing kind easy question add wave ...,processing,return value range define height leave height ...,1,1
9216,9216,-4,Events operators : noEvent,<p>Im using Dymola to model an electrical circ...,modelica|dymola,"<p>As Matti already mentioned, please use inli...",event operator noevent,im use dymola model electrical circuit simulat...,modelica dymola,matti already mention please use inline code t...,1,0
11797,11797,1,Bash find replace multiple files to NEW files,<p>I have a directory of let's call them templ...,bash|ubuntu|sed|find,<p>This might work for you (GNU sed and parall...,bash find replace multiple file new file,directory let call template file one value nee...,bash ubuntu sed find,might work gnu sed parallel sub find template ...,1,1
11219,11219,1,Is it bad practice to add CR/LF's to logging m...,<p>We have a method which we use to log java e...,java|logging|owasp,<p>generally - yes.</p>\n\n<p>while this seems...,bad practice add crlfs log message improve rea...,method use log java exception log file method ...,java log owasp,generally yes seem nice feature first glance c...,2,1
7330,7330,-2,How to load content on touches or mouseover?,<p>How to load an <code>HTML</code> content on...,javascript|html,<p>For a smooth transition:</p>\n\n<pre><code>...,load content touch mouseover,load content mouseover help javascript example...,javascript html,smooth transition mydiv opacity transition opa...,2,0
25418,25418,57,"WARNING : No target specified, deploying to em...",<p>I'm trying to run an ionic app on my mobile...,android|cordova|ionic,<p>Once you are remove the USB plugin from the...,warn target specify deploy emulator,im try run ionic app mobile phone android usbd...,android cordova ionic,remove usb plugin computer mean work already o...,2,2
15831,15831,1,Room persistence with retrofit nested object r...,<p>I am trying to store some retrofit response...,android|kotlin|retrofit|android-room,<p>Try to make use of <code>@Embedded</code> a...,room persistence retrofit nest object response,try store retrofit response room db retrofit r...,android kotlin retrofit androidroom,try make use annotation like show way need sec...,1,1


In [145]:
df_test.groupby(['score_cat', 'predicted_score_cat']).size().unstack(fill_value=0)

predicted_score_cat,0,1,2
score_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,78,1048,1929
1,12,1282,1719
2,10,843,2076


In [147]:
# Examining if correctly predicted bad answers tend to have negative scores
df_test_new = df_test[(df_test['score_cat'] == 0) & 
          (df_test['predicted_score_cat'] == 0)] 

df_test_new.head(30)

Unnamed: 0.1,Unnamed: 0,score,question_title,question_text,tags,answer_text,question_title_clean,question_text_clean,tags_clean,answer_text_clean,predicted_score_cat,score_cat
4685,4685,-5,Project Euler #3 Python,"<p>This is my solution to <a href=""https://pro...",python,<p>finding factors of N only need to check upt...,project euler python,solution project euler problem write code proj...,python,find factor need check upto first basic solu...,0,0
9784,9784,-1,Can someone explain this little piece of code ...,<p>The output is :</p>\n\n<pre><code>25\n28ff1...,c,<p><em>...but why is the second output the sam...,someone explain little piece code,output ffc ffc understand first value address ...,,second output same original code float add int...,0,0
433,433,-2,'which' command outputs nothing in Ubuntu,<p>I want to figure out path of the executable...,linux|bash|shell|ubuntu,<p>If <code>which exe</code> does not return a...,which command output nothing ubuntu,want figure path executable command say nothin...,linux bash shell ubuntu,return string mean executable item script bina...,0,0
8059,8059,-1,How to find out which window triggered WM_PARE...,<p>When I set the parent of a button (A) to an...,c++|windows|user-interface|winapi|frontend,"<p>Yes, it's sure that it seems strange that s...",find window trigger wmparentnotify,set parent button another button find trigger...,window userinterface winapi frontend,yes sure seem strange somebody put button anot...,0,0
4850,4850,-1,Add Language colors for the repositories using...,<p>I wanted to display all repositories of a u...,javascript|rest|api|github,<p>a simple Google search returns several repo...,add language color repository use rest api,want display repository username use rest api ...,javascript rest api github,simple google search return several repos data...,0,0
3215,3215,-1,My filter app with the following lines of code...,<p>I am using a box blur algorithm to create a...,c|image|blur|cs50,<p>Nasty little typos...</p>\n\n<p>I stripped ...,filter app follow line code produce blur effec...,use box blur algorithm create blur filter imag...,image blur,nasty little typo strip line could obviously c...,0,0
4621,4621,-5,char* buffer = new vs char buffer[] in C++,<pre><code>1. char* buffer = new char[size]\n2...,c++,<pre><code>char* buffer = new char[size]\n</co...,char buffer new char buffer,char buffer new charsize char buffersize im ne...,,char buffer new charsize portable avoid really...,0,0
5401,5401,-1,React.js library that does same thing as react...,<p>I'm relatively new to react.js &amp; react-...,javascript|reactjs|react-native,"<p>You just haven't asked the internet, But th...",react library thing reactnativerouterflux rea...,im relatively new react amp reactnative wonde...,javascript reactjs reactnative,ask internet go httpsreacttraining comreactrouter,0,0
8906,8906,-1,Program instantly stopping in C?,<p>I have written this code:</p>\n\n<pre><code...,c,"<p>When you run a console application, a termi...",program instantly stop,write code include ltstdio hgt int main printf...,,run console application terminal window create...,0,0
7501,7501,-2,How to write leap year detection in C?,"<p>I posted a blog not long ago, but I have an...",c,<p>You can add the following condition to chec...,write leap year detection,post blog long ago another question concern pr...,,add follow condition check day range dd lt amp...,0,0


In [134]:
### Selecting rows from df_test based on condition that predicted and actual score diverge by 2
divergence_df = df_test[(df_test['predicted_score_cat'] - df_test['score_cat'] == abs(2))]

divergence_df.head(40)

Unnamed: 0.1,Unnamed: 0,score,question_title,question_text,tags,answer_text,question_title_clean,question_text_clean,tags_clean,answer_text_clean,predicted_score_cat,score_cat
1125,1125,-2,How to do a slideshow in HTML CSS,<p>I want to do a slideshow like this one : <a...,javascript|html|css|slideshow,"<p>I recommend you to use the <a href=""https:/...",slideshow html cs,want slideshow like one httpswww inextenso dig...,javascript html cs slideshow,recommend use slick library work jquery allow ...,2,0
2242,2242,-1,Android: Reading json from sampledata folder,<p>I would like to fill <code>RecyclerView</co...,java|android|json|kotlin,<p>You should do step by step as below or watc...,android read json sampledata folder,would like fill custom file stay sampledata fo...,java android json kotlin,step step watch tutorial use kotlin language c...,2,0
9216,9216,-4,Events operators : noEvent,<p>Im using Dymola to model an electrical circ...,modelica|dymola,"<p>As Matti already mentioned, please use inli...",event operator noevent,im use dymola model electrical circuit simulat...,modelica dymola,matti already mention please use inline code t...,2,0
7330,7330,-2,How to load content on touches or mouseover?,<p>How to load an <code>HTML</code> content on...,javascript|html,<p>For a smooth transition:</p>\n\n<pre><code>...,load content touch mouseover,load content mouseover help javascript example...,javascript html,smooth transition mydiv opacity transition opa...,2,0
3204,3204,-1,How can I create heightmap for QHeightMapSurfa...,<p>I have the data - 2D discrete Fourier trans...,c++|qt|heightmap,<p><code>QHeightMapSurfaceDataProxy</code>'s c...,create heightmap qheightmapsurfacedataproxy a...,data discrete fourier transform result want g...,qt heightmap,constructor take image image file argument ne...,2,0
1961,1961,-1,DB2 database backup encryption,<p>I want to take encrypted backup of my exist...,encryption|db2|db2-luw,<p>DB2 encryption feature is introduced in 10....,db database backup encryption,want take encrypt backup exist database encryp...,encryption db dbluw,db encryption feature introduce fix pack,2,0
7776,7776,-1,Get Substring from Value as key in an array,<p>I have an array as below.</p>\n\n<pre><code...,php|arrays,<p>This should get it all in one array as you ...,get substring value key array,array as array array gt name sip gt status arr...,php array,get one array request newarray foreach as inne...,2,0
5565,5565,-4,Map of slice of maps,<p>I have the following snippet of code from a...,list|dictionary|go,<p>I assume the <code>res</code> object are th...,map slice map,follow snippet code function return http respo...,list dictionary go,assume object slicearray value item map someth...,2,0
1907,1907,-1,Bottom off the site is cut off when scroll is up,<p>I have tried so many iterations but I dont ...,html|css,<p>I fix it adding height:calc(100% - 42px); t...,bottom site cut scroll,try many iteration dont know change toppx kont...,html cs,fix add heightcalc px kontenerauto look nice,2,0
4301,4301,-1,Hyperledger fabric web application,<p>How to build web applications for hyperledg...,web-applications|hyperledger-fabric,<p>Hyperledger fabric is a distributed Blockch...,hyperledger fabric web application,build web application hyperledger fabric netwo...,webapplications hyperledgerfabric,hyperledger fabric distribute blockchain syste...,2,0


In [148]:
divergence_df.groupby(['score', 'predicted_score_cat']).size().unstack(fill_value=0)

predicted_score_cat,2
score,Unnamed: 1_level_1
-22,1
-11,1
-10,1
-8,4
-7,5
-6,13
-5,47
-4,104
-3,245
-2,655


In [149]:
### Define the model cross-validation configuration

cv = KFold(n_splits=5, shuffle=True, random_state=1)

In [150]:
cross_val_score(model_pipeline, X_train, y_train, cv=cv)

array([0.39533222, 0.40747797, 0.39328252, 0.39042401, 0.41924726])

In [156]:
### Create param grid based on results from random grid search

param_grid = {'classifier__n_estimators': [3000],
               'classifier__max_features': ['sqrt'],
               'classifier__max_depth': [None],
               'classifier__min_samples_split': [1, 2],
               'classifier__min_samples_leaf': [1, 2]}

print(param_grid)

{'classifier__n_estimators': [3000], 'classifier__max_features': ['sqrt'], 'classifier__max_depth': [None], 'classifier__min_samples_split': [1, 2], 'classifier__min_samples_leaf': [1, 2]}


In [157]:
### Choose best-performing model to tune using GridSearchCV

grid_classifier = GridSearchCV(model_pipeline, param_grid = param_grid, cv=cv, iid=False, n_jobs=-1, refit = True)
# scoring='roc_auc' --> reincorporate

In [158]:
grid_classifier.fit(X_train, y_train)

print("Best result: %f using parameters %s" % (grid_classifier.best_score_, grid_classifier.best_params_))

Best result: 0.402153 using parameters {'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 3000}


In [159]:
### Assess model performance on test data
print("Model Score assessed on test data: %.3f" % grid_classifier.score(X_test, y_test))

print("Classification Report:", classification_report(y_test, grid_classifier.predict(X_test)))

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html

Model Score assessed on test data: 0.378
Classification Report:               precision    recall  f1-score   support

           0       0.39      0.09      0.14      3055
           1       0.41      0.35      0.38      3013
           2       0.36      0.71      0.48      2929

    accuracy                           0.38      8997
   macro avg       0.39      0.38      0.33      8997
weighted avg       0.39      0.38      0.33      8997



In [151]:
### Define classifier with tuned parameters

classifier = GradientBoostingClassifier(max_depth = None, 
                                        max_features = 'sqrt', 
                                        min_samples_leaf = 1,
                                        min_samples_split = 2, 
                                        n_estimators = 2000, 
                                        random_state = 1)

In [90]:
### Choose best-performing model to tune using random hyperparameter grid

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

### Create random grid
random_grid = {'classifier__n_estimators': n_estimators,
               'classifier__max_features': max_features,
               'classifier__max_depth': max_depth,
               'classifier__min_samples_split': min_samples_split,
               'classifier__min_samples_leaf': min_samples_leaf}

print(random_grid)

{'classifier__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'classifier__max_features': ['auto', 'sqrt'], 'classifier__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'classifier__min_samples_split': [2, 5, 10], 'classifier__min_samples_leaf': [1, 2, 4]}


In [91]:
### Find best combination of parameters using randomized hyperparameter search

random_grid_classifier = RandomizedSearchCV(model_pipeline, param_distributions = random_grid, n_iter = 100, cv = cv, verbose=2, random_state=42, n_jobs = -1)

random_grid_classifier.fit(X_train, y_train)

print(random_grid_classifier.best_params_)

print(random_grid_classifier.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 32.9min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 74.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 166.2min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 239.8min finished
{'classifier__n_estimators': 2000, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 1, 'classifier__max_features': 'sqrt', 'classifier__max_depth': None}
0.37238393071926873


In [None]:
### Fitting pipeline to train data

model_pipeline.fit(X_train, y_train)

### Assess model performance on test data

print("model score: %.3f" % model_pipeline.score(X_test, y_test))

In [None]:
### Pipe different features in with a name so the step can be later called for details

pipeline = Pipeline([
    ('feats', FeatureUnion([
        # Ngrams
        ('ngram_all', Ngrams(X_train[['answer_text_clean']]))
    ])),
     # Classifier
     ('kneighbors', KNeighborsClassifier(n_neighbors=5, leaf_size=40))])

In [None]:
### Cross validation and tuning
from sklearn.model_selection import GridSearchCV
param_grid = {'kneighbors__n_neighbors': (3, 5, 10),
            'kneighbors__leaf_size': (10, 20 , 30),
            'kneighbors__p': (1,2)
            }

In [None]:
### Find best model
grid_search = GridSearchCV(pipeline, param_grid, cv=5, iid=False, n_jobs=-1, refit = True)
grid_search.fit(X_train, y_train)

In [None]:
### Print best model
print("Best score:", grid_search.best_score_)
print("Best params:", grid_search.best_params_)

print(classification_report(y_test, grid_search.predict(X_test)))

In [None]:
### To Do

# Finalize cleaner function (whitespaces etc.)
# Additional features, e.g.
    ### Figure out no. of switches from code to explanation
    ### Extract tags into separate columns and one-hot-encode

# Play with different ngram (1,2,3) and max feature numbers
# Incorporate functions/call them in pipeline
# Try out different models
# Hypertune model


# Add cross validation
# Look at mispredictions to make more targeted features
# Make those features
# Model and you can also try additional types of models