<h1>Classification and Data Modeling</h1>

In [87]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

#importing classifiers
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [88]:
#saving scraped data to pandas dataframes
politics = pd.read_csv("../data/Politics_data_full.csv", index_col = 0)
politics.text.replace('[]', np.nan, inplace=True)
politics = politics = politics.dropna().reset_index(drop=True)

money = pd.read_csv("../data/Medium_Money_Data_Final.csv", index_col = 0)
money.text.replace("[]", np.nan, inplace=True)
money = money.dropna().reset_index(drop=True)

art = pd.read_csv("../data/Medium_Articles_Art_Data.csv", index_col = 0)
art.text.replace("[]", np.nan, inplace=True)
art = art.dropna().reset_index(drop=True)

politics.text, money.text, art.text

(0       ['[Photo courtesy of Pixabay, There is a diffe...
 1       ['[image courtesy of Pixabay, Last night I was...
 2       ['[I can’t write about politics today. It’s to...
 3       ['[Lethal autonomous weapon systems (LAWS) are...
 4       ['[Photo courtesy of Pixabay, There is a diffe...
                               ...                        
 3143    ['[Candace Owens, an African-American commenta...
 3144    ['[Byrne Hobart makes some interesting points ...
 3145    ['[If karma were to exist, it’s hard to imagin...
 3146    ['[I’ve written before about potentially fatal...
 3147    ['[Migration scored with film audiences in 201...
 Name: text, Length: 3148, dtype: object,
 0       ['[Potential ways to Save using a Free Library...
 1       ['[By Tara Lachapelle, Netflix Inc. broke the ...
 2       ['[By Noah Smith, The great irony of the ironi...
 3       ['[By Faye Flam, It’s become a kind of sport t...
 4       ['[No matter what lifestyle you plan in retire...
              

<h2>NLP</h2>

In [98]:
#politics data
str1 = politics.text.sample(n=900)

#money data 
str2 = money.text.sample(n=900)

#art data
str3 = art.text.sample(n=900)

str1, str2, str3

(1044    ['[Donald Trump and Republican crony Tom Cotto...
 2511    ['[North Korea more-or-less steals one of Trum...
 2116    ["[Recently, I announced the kickoff of the so...
 715     ['[Speaker of the House Nancy Pelosi announced...
 549     ['[In less than a day after White House gave i...
                               ...                        
 703     ["[While there has been much debate over why t...
 626     ['[By Eric Levitz, Earlier this year, Elizabet...
 2716    ['[President Trump refuses to acknowledge that...
 147     ['[Based on the research of Tricia Moravec, Te...
 762     ['[And it’s all because of the two-party syste...
 Name: text, Length: 900, dtype: object,
 852     ['[After publishing my last article about Carb...
 354     ['[You’re no doubt aware that the more financi...
 582     ['[Recently, personal finance expert and TV pe...
 445     ['[WeWork’s business model is simple. It lease...
 88      ['[The online landscape (especially places lik...
               

In [99]:
#puts all strings together
all_str = pd.concat([str1,str2,str3]).reset_index(drop=True)

In [101]:
#create the vector
#focus on max_features as well
vector = tfidf(stop_words = "english", strip_accents = 'ascii', max_features = 50)

#fit the data
vector.fit(all_str)

#can look at the vectorizer
print(vector.get_feature_names())

['000', 'american', 'art', 'artist', 'artists', 'better', 'day', 'debt', 'did', 'different', 'does', 'dont', 'financial', 'going', 'good', 'im', 'income', 'just', 'know', 'life', 'like', 'long', 'look', 'make', 'making', 'market', 'money', 'need', 'new', 'painting', 'pay', 'people', 'political', 'president', 'really', 'right', 'say', 'thats', 'things', 'think', 'time', 'trump', 'use', 'want', 'way', 'work', 'world', 'year', 'years', 'youre']


In [102]:
#transforming the data, aka manipulating it based on the weights we 
#determined before by fitting our data 
train = vector.transform(all_str)

In [103]:
#making and cleaning test data
test1 = politics.text

test2 = money.text

test3 = art.text

test_set = pd.concat([test1, test2, test3]).reset_index(drop=True)

In [104]:
#still need to preprocess (in string form)
test_trans = vector.transform(test_set)

In [105]:
train_Y = [0] * 900 + [1] * 900 + [2] * 900
test_Y = [0] * test1.size + [1] * test2.size + [2] * test3.size

In [109]:
#create the model (look at the documentation to see if there's any 
# parameters) you should pay closer attention to.

#decision tree
dt = DecisionTreeClassifier(max_depth = 19)
dt.fit(train, train_Y)

#knn
knn = KNeighborsClassifier(n_neighbors = 2)
knn.fit(train, train_Y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                     weights='uniform')

In [110]:
dt.score(test_trans, test_Y), knn.score(test_trans, test_Y)

(0.9275970619097587, 0.9179783140958377)

In [108]:
dt_scores = []
knn_scores = []
for i in range(25):
    dt = DecisionTreeClassifier(max_depth = i+1)
    dt.fit(train, train_Y)
    dt_scores.append(dt.score(test_trans, test_Y))
    
    knn = KNeighborsClassifier(n_neighbors = i+1)
    knn.fit(train, train_Y)
    knn_scores.append(knn.score(test_trans, test_Y))
    
dt_scores.index(max(dt_scores))+1, knn_scores.index(max(knn_scores))+1

(19, 2)