In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.tree import DecisionTreeClassifier

In [2]:
politics = pd.read_csv("./final_csvs/politics.csv", index_col = 0)
money = pd.read_csv("./final_csvs/money.csv", index_col = 0)
sports = pd.read_csv("./final_csvs/sports.csv", index_col = 0)
tech = pd.read_csv("./final_csvs/tech.csv")

In [3]:
#there was a bug with the webscraping: people with 2020 claps have the wrong number of claps. 
#checked via website, having differing number of claps
politics = politics[politics.claps != 2020]

#get ratio of clap to followers, put as a new feature
clap_ratio = politics.claps / politics.followers

#get number of followers:
followers = politics.followers

#put the clap ratio as a new feature
politics["clap_ratio"] = clap_ratio

In [4]:
politics.head()

Unnamed: 0,url,title,author,username,user_since,following,followers,published,claps,text,tags,clap_ratio
0,https://medium.com/@lizardgrey/lets-pass-on-th...,"Let’s Pass on the Pizzazz, NBC, and Listen to ...",Elizabeth Grey,medium.com/@lizardgrey,2019.0,82.0,57.0,Nov 14,22,"['[Photo courtesy of Pixabay, There is a diffe...","['Politics', 'Impeachment', 'Ukraine', 'Diplom...",0.385965
1,https://medium.com/swlh/politics-isnt-my-passi...,Politics Isn’t My Passion,Elizabeth Grey,medium.com/@lizardgrey,2019.0,82.0,57.0,Nov 12,103,"['[image courtesy of Pixabay, Last night I was...","['Top ', 'Story', 'Submit, ', 'Politics', 'Tru...",1.807018
2,https://medium.com/@lizardgrey/my-no-lies-diet...,The No More Lies Diet Book,Elizabeth Grey,medium.com/@lizardgrey,2019.0,82.0,57.0,Nov 10,2,['[I can’t write about politics today. It’s to...,"['Weight ', 'Loss', 'Self ', 'Improvement', 'E...",0.035088
3,https://medium.com/the-slowdown/how-likely-is-...,How Likely is a Real-life Terminator?,Amie Haven,medium.com/@amiehaven,2019.0,17.0,21.0,Nov 14,104,['[Lethal autonomous weapon systems (LAWS) are...,"['About', 'Pitch ', 'Us ', 'Slalom, ', 'Artifi...",4.952381
4,https://medium.com/@lizardgrey/lets-pass-on-th...,"Let’s Pass on the Pizzazz, NBC, and Listen to ...",Elizabeth Grey,medium.com/@lizardgrey,2019.0,82.0,57.0,Nov 14,22,"['[Photo courtesy of Pixabay, There is a diffe...","['Politics', 'Impeachment', 'Ukraine', 'Diplom...",0.385965


## NLP

In [83]:
#we need to do a bit of preprocessing first! 
#remove pixabay

#politics data
str1 = politics.text.iloc[:10]

#money data 
str2 = money.text.iloc[:10]

In [106]:
all_str = pd.concat([str1,str2]).reset_index(drop=True)
all_str.iloc[:10]

0    ['[Photo courtesy of Pixabay, There is a diffe...
1    ['[image courtesy of Pixabay, Last night I was...
2    ['[I can’t write about politics today. It’s to...
3    ['[Lethal autonomous weapon systems (LAWS) are...
4    ['[Photo courtesy of Pixabay, There is a diffe...
5    ['[What the first day of the House’s public he...
6    ['[To the best of my knowledge, nobody out the...
7    ['[A New York judge ordered on Thursday that P...
8    ['[Bill Gates joined the list of billionaires ...
9    ['[Let’s talk about guns', 'My children had a ...
Name: text, dtype: object

In [95]:
#create the vector
#focus on max_features as well
vector = tfidf(stop_words = "english", strip_accents = 'ascii', max_features = 50)

#fit the data
vector.fit(all_str)

#can look at the vectorizer
print(vector.get_feature_names())

['account', 'air', 'apple', 'bubble', 'bubbles', 'card', 'companies', 'company', 'credit', 'day', 'did', 'gates', 'human', 'ico', 'investment', 'investors', 'just', 'know', 'laws', 'like', 'make', 'market', 'money', 'month', 'new', 'news', 'people', 'plan', 'portfolio', 'president', 'price', 'public', 'retirement', 'right', 'said', 'save', 'say', 'sea', 'south', 'stock', 'stocks', 'time', 'tronics', 'trump', 'try', 'want', 'way', 'write', 'year', 'years']


In [96]:
#transforming the data, aka manipulating it based on the weights we 
#determined before by fitting our data 
train = vector.transform(all_str)

In [97]:
train

<20x50 sparse matrix of type '<class 'numpy.float64'>'
	with 283 stored elements in Compressed Sparse Row format>

In [98]:
train_Y = [0] * 10 + [1] * 10
train_Y

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

### when do we do train test split ?

-when we want to predict something we've never seen before.

In [99]:
#create the test set
test1 = politics.text.iloc[10:15]
test2 = money.text.iloc[10:15]
#test1
test_set = pd.concat([test1, test2]).reset_index(drop=True)

In [100]:
#still need to preprocess (in string form)
test_trans = vector.transform(test_set)

In [101]:
test_trans

<10x50 sparse matrix of type '<class 'numpy.float64'>'
	with 135 stored elements in Compressed Sparse Row format>

In [102]:
# need Y (aka labels since this is supervised)

test_Y = [0] * 5 + [1] * 5

In [103]:
train_Y

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [104]:
#create the model (look at the documentation to see if there's any 
# parameters) you should pay closer attention to.

#this is how you create a basic decision tree
dt = DecisionTreeClassifier()

#fit the model, use your train data
dt.fit(train, train_Y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [105]:
dt.score(test_trans, test_Y)

0.4