# Project 6 - Pipeline 3 - Train

## Mine: Select all pages from database

In [1]:
import os

# Work from the directory one level above this notebook; the relative paths
# used below ('data/df.pkl', 'data/model.pkl') are resolved against it, and
# presumably the `lib` package imports are too.
# The starting directory is remembered the first time this cell runs, so
# re-running the cell no longer climbs one directory higher on every execution.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, '../'))

In [2]:
import re
import string

import pandas as pd
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split

import lib.database_module as db

In [3]:
# Load every row of the `page` table into a DataFrame.
sql = """select * from page;"""
con, cur = db.connect_to_postgres()
try:
    pages = pd.read_sql(sql, con=con)
finally:
    # release the cursor and the connection even if the query fails
    cur.close()
    con.close()

Connected to server joshuacook.me.


In [4]:
# drop the created_at column; nothing further down reads it
pages = pages.drop('created_at', axis=1)

In [5]:
# keep only the rows whose page text is not an empty string
has_text = pages['page'] != ''
pages = pages[has_text]

## Mine: Select category names corresponding to pages from database

In [6]:
# Fetch one (page_id, category_name) row per page/category link.
sql = """
        select page_cate."page_id", category."category_name"
        from page_cate
        join category
        on page_cate."category_id" = category."category_id"
    
      """
con, cur = db.connect_to_postgres()
try:
    # No `columns=` argument: pandas only honours it when reading a whole
    # table, and the old list named a column (category_id) that this query
    # does not select. The SELECT list alone defines the resulting columns.
    page_cat_name = pd.read_sql(sql, con=con)
finally:
    # release the cursor and the connection even if the query fails
    cur.close()
    con.close()

Connected to server joshuacook.me.


In [7]:
# peek at the first two page_id / category_name pairs returned by the query
page_cat_name.head(2)

Unnamed: 0,page_id,category_name
0,19572217,influenza
1,82425,Sandwiches


In [8]:
# Attach category names to the pages. how='left' keeps every row of `pages`,
# so a page with no match in page_cat_name gets a missing category_name.
df = pages.merge(page_cat_name, on='page_id', how='left')

In [9]:
# sanity-check the merged frame: show its first two rows
df.head(2)

Unnamed: 0,page_id,title,page,category_name
0,19648694,GT by Citroen,The GT by Citroën (sometimes spelled GTbyCitro...,sports cars
1,38993657,Barbecue sandwich,A barbecue sandwich is a sandwich that is typi...,Sandwiches


In [10]:
# remove the psychology category because it was wrongfully uploaded

In [11]:
# discard every row labelled with the 'psychology' category
not_psychology = df['category_name'] != 'psychology'
df = df[not_psychology]

In [12]:
# remove all bread pages that wrongfully went to Arcade_games

In [13]:
# page ids to exclude from the data set (used by the filter in the next cell)
dup_pages = [
    46641426, 48325710, 34653956, 31940835,
    40549423, 42108065, 17000799, 17945187,
    5540091, 3172418, 2205457, 7898235,
    1070039, 47514948, 2904185, 2924882,
]

In [14]:
# Drop the whole 'Arcade_games' category as well as every page listed in dup_pages.
in_arcade_games = df['category_name'] == 'Arcade_games'
in_dup_pages = df['page_id'].isin(dup_pages)
df = df[~(in_arcade_games | in_dup_pages)]

In [15]:
# clean pages: lower-case the text and strip ASCII punctuation.
# The previous `.str.translate(None, string.punctuation)` form relies on the
# Python 2 two-argument translate and fails on Python 3, so the characters are
# removed with a regular expression instead, which behaves the same on both.
punctuation_re = re.compile('[' + re.escape(string.punctuation) + ']')
# assign() builds a new frame instead of writing into the filtered slice held
# in `df`, which avoids pandas' SettingWithCopyWarning. na_action='ignore'
# leaves missing page values untouched.
df = df.assign(
    page=df['page'].str.lower().map(lambda text: punctuation_re.sub('', text),
                                    na_action='ignore')
)

In [16]:
# snapshot the cleaned frame to disk
df.to_pickle(path='data/df.pkl')

## Refine: Create a data dictionary with training and testing sets

In [19]:
# features: cleaned page text; target: the page's category name
X, y = df['page'], df['category_name']

In [20]:
# Hold out part of the data for testing (train_test_split's default split
# size); the fixed random_state keeps the split the same between runs.
split = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split

In [21]:
# Bundle the full data, the train/test split and two empty score lists into
# one dictionary — presumably the layout the lib.project_5 helpers work with
# (their code is not visible here; TODO confirm).
data_dict = dict(
    X=X,
    y=y,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    train_score=[],
    test_score=[],
)

## Model: Fit, Score, and Tune a multi-label classification model

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from lib.project_5 import general_estimator, general_transformer

In [23]:
# Feature pipeline: TF-IDF term weights -> row normalisation -> truncated SVD
# down to 17 components.
# NOTE(review): TfidfVectorizer L2-normalises rows by default, so the
# 'normalize' step looks redundant in this position; LSA recipes usually
# normalise *after* the SVD. Confirm the intended order before changing it,
# because reordering alters the fitted model and its scores.
lsa_steps = [
    ('vectorizer', TfidfVectorizer(stop_words='english')),
    ('normalize', Normalizer(copy=False)),
    ('svd', TruncatedSVD(n_components=17, n_iter=10, random_state=42)),
]
lsa = Pipeline(steps=lsa_steps)

In [24]:
# Run the LSA pipeline through general_transformer (lib.project_5) and keep
# the dictionary it returns as the working data_dict.
# X and y are passed directly instead of data_dict['X'] / data_dict['y']:
# the result is assigned back to data_dict, so reading the inputs from
# data_dict made a second run of this cell depend on whatever the first run
# returned. On a fresh top-to-bottom run the inputs are the same objects.
# NOTE(review): general_transformer presumably fits the pipeline and rebuilds
# the train/test entries itself — confirm against lib.project_5.
data_dict = general_transformer(lsa, X, y)

In [25]:
# Hand a default-parameter k-nearest-neighbours classifier to
# general_estimator (lib.project_5), which presumably fits and scores it and
# returns the updated dictionary — TODO confirm.
knn = KNeighborsClassifier()
data_dict = general_estimator(knn, data_dict)

In [26]:
# training-set score(s) recorded in data_dict so far
data_dict['train_score']

[0.93384982121573301]

In [27]:
# test-set score(s) recorded in data_dict so far
data_dict['test_score']

[0.89642857142857146]

## Present: Pickle tuned model for later use

In [28]:
# Whatever is stored under 'models' — presumably the fitted estimator(s)
# added by general_estimator (lib.project_5); TODO confirm.
model = data_dict['models']

## Pickle model

In [None]:
# write the model to disk so it can be reloaded later without retraining
joblib.dump(value=model, filename='data/model.pkl')