# Real Smart Model Iterating

It's time to model smarter

In [1]:
# Standard stuff
import pandas as pd
import string

# Preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer 

# Modeling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier



In [2]:
df = pd.read_csv('../data/final_plots.csv').drop('Unnamed: 0', axis=1)

def bin_scores(score):
    if score > 60:
        if score > 75:
            return 2
        return 1
    return 0

df['success'] = df['Critic_Score'].apply(bin_scores)

def clean_text(text):
    
    # Removes punctuation
    words = [''.join(ch for ch in s if ch not in string.punctuation)\
             for s in text.split()]
    
    # Returns the lower-case string
    return ' '.join(words).lower()

df.plots = df.plots.apply(clean_text)

In [3]:
stops = stopwords.words('english') + ['game', 'player', 'gameplay']
df.head()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,plots,success
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E,wii sports consists of five separate sports ga...,2
1,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E,mario kart wii is a kart racing game featuring...,2
2,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E,wii sports resort is a sports video game set i...,2
3,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,11.28,9.14,6.5,2.88,29.8,89.0,65.0,8.5,431.0,Nintendo,E,mario and princess peach are walking together ...,2
4,Wii Play,Wii,2006.0,Misc,Nintendo,13.96,9.18,2.93,2.84,28.92,58.0,41.0,6.6,129.0,Nintendo,E,wii play is a party game consisting of nine mi...,0


We need to tag our parts of speech so we can lemmatize them effectively 

In [4]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    '''
    Translate nltk POS to wordnet tags
    '''
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

Continuing from the last notebook, I'd like to start by making a pipeline so we can get some cross-validation going without worrying about data leakage.

For the first pipeline we'll stick with the random forest

In [5]:
def big_lemmatizer(text):
    tokens = [word for word in text.split() if word not in stops]
    tags = pos_tag(tokens)
    tags = [(word[0], get_wordnet_pos(word[1])) for word in tags]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word[0], word[1]) for word in tags]
    return ' '.join(words)
df.plots = df.plots.apply(big_lemmatizer)

In [6]:
df.plots.apply(clean_text)
y = df[['success']]
X = df[['plots']]
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [7]:
gs = GridSearchCV(pipe, {})

NameError: name 'pipe' is not defined

In [None]:
tf = TfidfVectorizer(stop_words='english', max_features=50)
X_train_tf = tf.fit_transform(X_train.plots)
X_test_tf = tf.transform(X_test.plots)



In [None]:
rf = RandomForestClassifier()
rf.fit(X_train_tf, y_train.values.ravel())

In [None]:
print('train acc: ', rf.score(X_train_tf, y_train))
print('test acc: ', rf.score(X_test_tf, y_test))

In [None]:
pipe = Pipeline([
    ('tf', TfidfVectorizer(stop_words=stops)),
    ('rf', RandomForestClassifier())
])

In [None]:
params = {'tf__max_features': (20,50,100,500),
          'tf__ngram_range': ((1,1), (1,2)),
          'rf__max_depth': (5, 10, 20, 100, None)}
gs = GridSearchCV(estimator=pipe, param_grid=params, return_train_score=True)

In [None]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier()
    ]

In [None]:
for classifier in classifiers:
    
    classifier.fit(X_train_tf, y_train.values.ravel())   
    print(classifier)
    print("model score: %.3f" % classifier.score(X_test_tf, y_test.values.ravel()))

In [None]:
df.plots = df.Platform.apply(str.lower) + ' ' + df.plots

In [None]:
df.head()