<h1>Classification and Data Modeling</h1>

In [70]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

#importing classifiers
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [71]:
#saving scraped data to pandas dataframes
politics = pd.read_csv("../data/Politics_data_full.csv", index_col = 0)

money = pd.read_csv("../data/Medium_Money_Data_Final.csv", index_col = 0)

art = pd.read_csv("../data/Medium_Articles_Art_Data.csv", index_col = 0)

<h2>NLP</h2>

In [72]:
#there was a bug with the webscraping: people with 2020 claps have the wrong number of claps. 
#checked via website, having differing number of claps
politics = politics[politics.claps != 2020]

#we need to do a bit of preprocessing first! 

#politics data
str1 = politics.text.iloc[:3000]
str1.dropna()

#money data 
str2 = money.text.iloc[:1500] 
str2.dropna()

#art data
str3 = art.text.iloc[:1000]
str3.dropna()

str1, str2, str3

(0       ['[Photo courtesy of Pixabay, There is a diffe...
 1       ['[image courtesy of Pixabay, Last night I was...
 2       ['[I can’t write about politics today. It’s to...
 3       ['[Lethal autonomous weapon systems (LAWS) are...
 4       ['[Photo courtesy of Pixabay, There is a diffe...
                               ...                        
 3190    ['[There’s trouble in centrist paradise as Qui...
 3193                                                   []
 3194    ['[On January 20th, 2017, Donald J. Trump was ...
 3195    ['[Hochman Salkin, IRS Commissioner Charles Re...
 3196    ['[Since the Mueller investigation was launche...
 Name: text, Length: 3000, dtype: object,
 0       ['[Potential ways to Save using a Free Library...
 1       ['[By Tara Lachapelle, Netflix Inc. broke the ...
 2       ['[By Noah Smith, The great irony of the ironi...
 3       ['[By Faye Flam, It’s become a kind of sport t...
 4       ['[The Apple Card, a collaborative product bet...
              

In [73]:
#puts all strings together
all_str = pd.concat([str1,str2,str3]).reset_index(drop=True)
all_str.replace('[]', np.nan, inplace=True)
all_str = all_str.dropna()

In [74]:
#create the vector
#focus on max_features as well
vector = tfidf(stop_words = "english", strip_accents = 'ascii', max_features = 50)

#fit the data
vector.fit(all_str)

#can look at the vectorizer
print(vector.get_feature_names())

['000', 'american', 'art', 'change', 'country', 'day', 'did', 'does', 'dont', 'financial', 'going', 'good', 'government', 'im', 'just', 'know', 'life', 'like', 'long', 'make', 'market', 'money', 'need', 'new', 'pay', 'people', 'political', 'power', 'president', 'public', 'right', 'said', 'say', 'social', 'state', 'states', 'thats', 'things', 'think', 'time', 'trump', 'use', 'want', 'way', 'white', 'work', 'world', 'year', 'years', 'youre']


In [75]:
#transforming the data, aka manipulating it based on the weights we 
#determined before by fitting our data 
train = vector.transform(all_str)

In [76]:
#making and cleaning test data
test1 = politics.text.iloc[3000:]
test1.dropna()

test2 = money.text.iloc[1500:]
test2.dropna()

test3 = art.text.iloc[1000:]
test3.dropna()

test_set = pd.concat([test1, test2, test3]).reset_index(drop=True)
test_set.replace('[]', np.nan, inplace=True)
test_set = test_set.dropna()

In [77]:
#still need to preprocess (in string form)
test_trans = vector.transform(test_set)

In [78]:
train_Y = [0] * all_str.size
test_Y = [0] * test_set.size

In [79]:
#create the model (look at the documentation to see if there's any 
# parameters) you should pay closer attention to.

#decision tree
dt = DecisionTreeClassifier()
dt.fit(train, train_Y)

#knn
knn = KNeighborsClassifier()
knn.fit(train, train_Y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [80]:
dt.score(test_trans, test_Y), knn.score(test_trans, test_Y)

(1.0, 1.0)