In [1]:
# Imports

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import sklearn.preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

import wrangle as w
import explore as e
import model as m

In [2]:
# acquire and prepare data
df = w.get_show_data()

# split data into train, validate and test
train, validate, test = w.split_my_data(df)

In [3]:
# get set of genres (computed from the train split only)
gen_set = e.get_gens(train)

# relative word frequencies from the train split, converted to a dict for lookup,
# plus a list of only the numbers
# NOTE(review): presumably keyed by word -> frequency; confirm against explore.get_word_freq
word_freq = dict(e.get_word_freq(train))
word_counts = e.get_counts(word_freq)

# relative document frequencies from the train split, converted to a dict for lookup,
# plus a list of only the numbers
# NOTE(review): presumably keyed by word -> frequency; confirm against explore.get_doc_freq
doc_freq = dict(e.get_doc_freq(train))
doc_counts = e.get_counts(doc_freq)

In [4]:
# separate each of train, validate and test into a single-column feature frame
# X (description) and a single-column target frame y (comedy); every resulting
# frame gets a fresh 0-based index (the old index is dropped)
train_X, train_y = (train[[col]].reset_index(drop=True) for col in ('description', 'comedy'))

validate_X, validate_y = (validate[[col]].reset_index(drop=True) for col in ('description', 'comedy'))

test_X, test_y = (test[[col]].reset_index(drop=True) for col in ('description', 'comedy'))

In [8]:
# vectorize the description frames of all three splits with a CountVectorizer
# NOTE(review): presumably the vectorizer is fit on train only and then applied to
# validate/test -- confirm in model.get_vectorized_data
train_counts, validate_counts, test_counts = m.get_vectorized_data(train_X, validate_X, test_X, CountVectorizer())

In [9]:
def remove_low_freq_from_X(df_X, freq_dict, threshold, verbose=True):
    """Keep only the columns of ``df_X`` whose frequency magnitude exceeds ``threshold``.

    Parameters
    ----------
    df_X : pandas.DataFrame
        Feature frame whose column labels are looked up in ``freq_dict``.
    freq_dict : dict
        Maps a column label to a numeric frequency score. Scores may be
        negative; only their absolute value is compared.
    threshold : float
        A column is kept when ``abs(score) > threshold`` (strictly greater).
    verbose : bool, default True
        When true, print the column count before and after filtering
        (one number per line).

    Returns
    -------
    pandas.DataFrame
        ``df_X`` restricted to the kept columns, in their original order.

    Notes
    -----
    A column with no entry in ``freq_dict`` is treated as having a score
    of 0 rather than raising ``KeyError``.
    """
    # .get(col, 0): a label missing from the table counts as frequency 0
    kept_cols = [col for col in df_X.columns if abs(freq_dict.get(col, 0)) > threshold]

    if verbose:
        print(len(df_X.columns))
        print(len(kept_cols))

    return df_X[kept_cols]

In [10]:
# keep only the train_counts columns whose word_freq value is non-zero (threshold 0);
# the two numbers printed first are the column counts before and after filtering
remove_low_freq_from_X(train_counts, word_freq , 0)

14815
13664


Unnamed: 0,aaliya,aardman,aba,aback,abagnale,abah,abandoned,abang,abbott,abby,...,zoo,zor,zorel,zoya,zulu,zumba,zumbo,zuo,zuri,zuru
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3237,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3239,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3240,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# NOTE: rounding to 2 decimals *before* scaling by 100 leaves only whole percents
# (0.16 * 100 -> '16.0%'); scale first and round afterwards to keep two decimal places
str(round(.1612, 2) * 100) + "%"

'16.0%'

In [15]:
# scale to a percentage first, then round to 2 decimals (gives 16.12)
round(.1612 * 100, 2) 

16.12