In [1]:
### Import packages for data manipulation

import pandas as pd
import numpy as np
import re

### Import packages to create absolute file path & make code independent of operating system

from pathlib import Path
import os.path

import warnings
warnings.filterwarnings("ignore")

### Import packages for feature extraction

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [2]:
### Read in dataset

base_path = Path("__file__").parent

full_path = (base_path / "../../data/processed/stackoverflow_preprocessed.csv").resolve()

stackoverflow = pd.read_csv(os.path.join(full_path))

In [None]:
stackoverflow = pd.read_csv("/Users/HenriekeMax/Documents/Career_Development/GitHub/FrauenLoop_NLP_Project_2020/data/processed/stackoverflow_preprocessed.csv")

In [None]:
stackoverflow.head()

In [None]:
### Define function to count 

def wordcounter(x):
    x = len(re.findall(r'\w+', x))
    return x

In [None]:
### Feature extraction
stackoverflow['answer_wordcount'] = stackoverflow['answer_text_clean'].apply(lambda x: wordcounter(x))

In [3]:
stackoverflow.groupby(['score_cat', 'answer_wordcount']).size().unstack(fill_value=0)

answer_wordcount,0,1,2,3,4,5,6,7,8,9,...,1337,1364,1449,1490,1521,1668,1919,1933,2019,2108
score_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bad,4,10,23,36,62,59,80,97,119,137,...,1,0,0,0,0,1,1,0,0,0
good,4,5,16,24,40,54,71,112,95,104,...,0,1,1,0,1,0,0,0,1,1
great,5,10,16,49,79,87,123,129,140,181,...,0,0,0,1,0,0,0,1,0,0


In [4]:
cleanup_nums = {'score_cat': {"bad": 0, "good": 1, "great" : 2}}

stackoverflow_copy = stackoverflow.replace(cleanup_nums, inplace=True)
stackoverflow_copy.head()

AttributeError: 'NoneType' object has no attribute 'head'

In [5]:
stackoverflow_copy['score_cat_int']= stackoverflow_copy.score_cat.astype(int)

AttributeError: 'NoneType' object has no attribute 'score_cat'

In [None]:
df = pd.DataFrame(stackoverflow_copy, columns=['score_cat_int', 'answer_wordcount'])
corrMatrix = df.corr()
sns.heatmap(corrMatrix, annot=True)
plt.show()

In [6]:
### Creating a binary feature "code" holding info on whether or not stackoverflow answer contains code

def codecheck(x):
    x = 1 if '<code>' in x else 0
    return x

stackoverflow['code_binary_2'] = stackoverflow['answer_text'].apply(codecheck)

In [7]:
### Creating a binary feature "code" holding info on how many code-snippets an answer contains

def codecheck(x):
    x = x.count("<code>")
    return x

stackoverflow['code_count'] = stackoverflow['answer_text'].apply(codecheck)

In [8]:
stackoverflow['code_binary_2'].value_counts()

1    24427
0     5573
Name: code_binary_2, dtype: int64

In [9]:
stackoverflow.groupby(['score_cat', 'code_binary_2']).size().unstack(fill_value=0)

code_binary_2,0,1
score_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1895,8105
1,1651,8349
2,2027,7973


In [10]:
stackoverflow.groupby(['score_cat', 'code_count']).size().unstack(fill_value=0)

code_count,0,1,2,3,4,5,6,7,8,9,...,69,70,73,74,77,80,85,88,101,102
score_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1895,2786,1748,1046,712,497,324,246,174,132,...,1,0,0,0,0,1,0,0,0,0
1,1651,2575,1729,1092,727,556,386,284,213,135,...,0,1,0,0,0,0,1,1,0,0
2,2027,3444,1712,951,555,299,273,161,113,85,...,0,0,1,1,2,0,0,0,1,1


In [None]:
pd.set_option('display.max_colwidth', -1)

In [None]:
stackoverflow.head()

In [11]:
### Split into predictors and outcome data

y = stackoverflow['score_cat']
X = stackoverflow.drop(['score_cat', 'score', 'answer_count', 'comment_count', 'creation_date', 'favorite_count', 'view_count'] , axis=1)  

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

In [13]:
### Compute n grams from a dataframe for a given variable
class Ngrams(BaseEstimator, TransformerMixin):

    def __init__(self, df):
        pass

    def transform(self, df):
        ### Save name of variable to analyze
        name = df.columns
        #### Initiate TfidfVectorizer
        vectorizer = TfidfVectorizer(strip_accents = 'unicode', use_idf = True, \
                                     stop_words = 'english', analyzer = 'word', \
                                     ngram_range = (1, 2), max_features = 30)
        ### Fit to data
        X_train = vectorizer.fit_transform(df[name[0]].values)
        ### Return sparse matrix
        return X_train
    
    def fit(self, df, y=None):
        ### Unless error returns self
        return self

In [14]:
### Pipe different features in with a name so the step can be later called for details

pipeline = Pipeline([
    ('feats', FeatureUnion([
        # Ngrams
        ('ngram_all', Ngrams(X_train[['answer_text_clean']]))
    ])),
     # Classifier
     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=52,
                           max_iter=10, tol=10)),])

In [15]:
### Cross validation and tuning
from sklearn.model_selection import GridSearchCV
param_grid = {'clf__alpha': (1e1, 1e3, 1e-5),
            'clf__max_iter': (20, 30)
            }

### Find best model
grid_search = GridSearchCV(pipeline, param_grid, cv=5, iid=False, n_jobs=-1, refit = True)
grid_search.fit(y_train, y_train['score_cat'])

### Print it
print(grid_search.best_score_)
print(grid_search.best_params_)

KeyError: 'score_cat'

In [None]:
### To Do

# Finalize cleaner function (whitespaces etc.)
# Additional features, e.g.
    ### Figure out no. of switches from code to explanation
    ### Extract tags into separate columns and one-hot-encode

# Migrate from notebooks to py scripts and troublshoot issues with wordcounter and not-defined issues
# Incorproate ngram class into code
# Play with different ngram (1,2,3) and max feature numbers
# Incorporate functions/call them in pipeline
# Try out different models
# Hypertune model