___
Team Member Names
- Name 1: Matthew D. Cusack
- Name 2: Tim Cabaza
- Name 3: Amy Adyanthaya

<a id="top"></a>
________
# Classification
____

## Contents
* <a href="#DataPrep1">Data Preparation Part 1</a>
* <a href="#DataPrep2">Data Preparation Part 2</a>
* <a href="#ModelEval1">Modeling and Evaluation 1</a>
* <a href="#ModelEval2">Modeling and Evaluation 2</a>
* <a href="#ModelEval3">Modeling and Evaluation 3</a>
    * <a href="#RFmodel">Random Forest Model</a>
    * <a href="#KNNmodel">KNN Model</a>
    * <a href="#SVMmodel">SVM Model</a>
* <a href="#ModelEval4">Modeling and Evaluation 4</a>
* <a href="#ModelEval5">Modeling and Evaluation 5</a>
    * <a href="#TaskEval">Comparing Task Performance Between Different Types of Models</a>
        * <a href="#sqrTask">On "share_quantile_range" Task</a>
        * <a href="#dowTask">On "day_of_week" Task</a>
* <a href="#ModelEval6">Modeling and Evaluation 6</a>
* <a href="#Deployment">Deployment</a>
* <a href="#Exceptional">Exceptional Work</a>
    * <a href="#ncTask">Comparing Task Performance Between Different Types of Models On "news_category" Task</a>
    * <a href="#ScalerEval">Comparing the StandardScaler and QuantileTransformer Versions of The Models</a>
        * <a href="#RFEval">Random Forest Models</a>
        * <a href="#KNNEval">KNN Models</a>
        * <a href="#SVMEval">SVM Models</a>
_______

In [1]:
# Load Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# this import allows you train and test you test split
from sklearn.model_selection import train_test_split
# this import allows you to standardize your data, scaling so that all features have a mean of zero and a standard deviation of 1. 
from sklearn.preprocessing import StandardScaler, QuantileTransformer
# this import allows you to create a logistic regression model; type of machine learning model that can be used for classification tasks 
from sklearn.linear_model import LogisticRegression
# this import allows you to create a support vector machine SVM model, a type of ML model that can be used for classification tasks. 
from sklearn.svm import SVC
# linear-only SVM solver that stays tractable on large sample counts (used for SVM feature selection)
from sklearn.svm import LinearSVC
# this import allows you to perform CV on your model, a technique for evaluating the performance of a ML on unseen data
from sklearn.model_selection import cross_val_score
# these imports allow you to calculate various evaluation metrics for your ML model. Eval metrics are used to asses the performance of a ML on held-out test set. 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
# for testing differences with 95% confidence
from scipy.stats import ttest_rel
# for RandomForest models
from sklearn.ensemble import RandomForestClassifier
# for KNN models
from sklearn.neighbors import KNeighborsClassifier
# for feature selection
from sklearn.feature_selection import SequentialFeatureSelector
# for chaining a scaler and an estimator so the scaler is re-fit inside every CV split
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import StratifiedKFold


In [2]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

# Location of the dataset to analyse
file_path = "../1 - Visualization and Data Preprocessing/Data/ONPClean2.csv" # previously cleaned
# file_path = '../1 - Visualization and Data Preprocessing/Data/OnlineNewsPopularity.csv' # unclean

# Read the CSV into a DataFrame and preview its first rows
df = pd.read_csv(filepath_or_buffer=file_path)
df.head()

Unnamed: 0,url_name,date,timedelta,n_tokens_title,n_unique_tokens,average_token_length,num_keywords,kw_min_min,kw_avg_min,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,is_weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares,day_of_week,news_category,year,month,log_shares,log_n_tokens_content,log_num_hrefs,log_num_self_hrefs,log_num_imgs,log_num_videos,log_kw_max_min,log_kw_min_max,log_kw_avg_avg,log_self_reference_min_shares,log_self_reference_max_shares,log_self_reference_avg_sharess
0,amazon-instant-video-browser/,2013-01-07,731.0,12.0,0.663594,4.680365,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500331,0.378279,0.040005,0.041263,0.040123,0.521617,0.092562,0.045662,0.013699,0.769231,0.230769,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593,Monday,Entertainment,2013,1,6.386879,5.393628,1.609438,1.098612,0.693147,0.0,0.0,0.0,0.0,6.20859,6.20859,6.20859
1,reeddit-reddit/,2013-01-07,731.0,8.0,0.821705,4.546154,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022265,0.022446,0.022276,0.251465,0.681548,0.381987,0.152189,0.038462,0.007692,0.833333,0.166667,0.353939,0.033333,0.7,-0.4,-0.4,-0.4,0.25,0.2,0.25,0.2,1300,Monday,Tech,2013,1,7.170888,4.875197,2.079442,1.609438,0.0,0.0,0.0,0.0,0.0,7.170888,7.170888,7.170888
2,rage-comics-dying/,2013-01-07,731.0,9.0,0.608602,4.759494,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028575,0.199626,0.028615,0.714611,0.028572,0.54258,0.12237,0.063291,0.025316,0.714286,0.285714,0.357269,0.05,0.6,-0.338889,-1.0,-0.05,0.65,-0.5,0.15,0.5,1100,Monday,Uncategorized,2013,1,7.003974,6.163315,2.484907,0.0,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,power-matters-alliance-organization/,2013-01-07,731.0,10.0,0.53539,5.147748,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020011,0.020317,0.117255,0.020007,0.82241,0.425089,0.128515,0.03964,0.012613,0.758621,0.241379,0.337965,0.05,0.7,-0.225794,-0.4,-0.125,0.5,-0.1,0.0,0.1,1600,Monday,Tech,2013,1,7.378384,6.320768,2.079442,1.94591,0.693147,0.0,0.0,0.0,0.0,7.550135,7.550135,7.550135
4,polaroid-android-camera/,2013-01-07,731.0,9.0,0.424132,4.63139,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025001,0.327017,0.025001,0.025001,0.597981,0.50652,0.279769,0.071749,0.013453,0.842105,0.157895,0.417055,0.1,1.0,-0.212354,-0.5,-0.05,0.333333,0.25,0.166667,0.25,2400,Monday,Tech,2013,1,7.783641,7.017506,3.091042,3.091042,3.044522,0.0,0.0,0.0,0.0,6.302619,9.680406,8.140199


_________
<a href="#top">Back to Top</a>
<a id="DataPrep1"></a>
# Data Preparation Part 1 [10 points]

## Class Variables

In [26]:
# Print a concise summary of `df` (column names, non-null counts and dtypes).
# NOTE(review): the recorded output of this cell is an AttributeError because
# `df` was a list when it last ran (execution count 26) — presumably a later
# cell rebinds `df`; re-run this cell right after the data-loading cell to confirm.
df.info()

AttributeError: 'list' object has no attribute 'info'

##### Original (before any cleaning):
url:        
    Contains the URL of the article with the date      
    Object

timedelta:               
    Days between the article publication and the dataset acquisition (non-predictive)               
    float64

n_tokens_title:               
    Number of words in the title               
    float64

n_tokens_content:               
    Number of words in the content               
    float64

n_unique_tokens:               
    Rate of unique words in the content               
    float64

n_non_stop_words:           
    Rate of non-stop words in the content           
    float64

n_non_stop_unique_tokens:      
    Rate of unique non-stop words in the content      
    float64

num_hrefs:                    
    Number of links                 
    float64

num_self_hrefs:               
    Number of links to other articles published by Mashable            
    float64

num_imgs:                      
    Number of images        
    float64

num_videos:                    
    Number of videos            
    float64
    
average_token_length:               
    Average length of the words in the content               
    float64

num_keywords:               
    Number of keywords in the metadata               
    float64

data_channel_is_lifestyle:     
    Is data channel 'Lifestyle'?            
    Binary (Yes = 1 / No = 0)       
    float64

data_channel_is_entertainment:          
    Is data channel 'Entertainment'?            
    Binary (Yes = 1 / No = 0)       
    float64

data_channel_is_bus:           
    Is data channel 'Business'?         
    Binary (Yes = 1 / No = 0)       
    float64
  
data_channel_is_socmed:        
    Is data channel 'Social Media'?             
    Binary (Yes = 1 / No = 0)       
    float64
   
data_channel_is_tech:          
    Is data channel 'Tech'?             
    Binary (Yes = 1 / No = 0)       
    float64
 
data_channel_is_world:         
    Is data channel 'World'?        
    Binary (Yes = 1 / No = 0)       
    float64
 
kw_min_min:               
    Worst keyword (min. shares)               
    float64

kw_max_min:                    
    Worst keyword (max. shares)         
    float64

kw_avg_min:                    
    Worst keyword (avg. shares)               
    float64

kw_min_max:                    
    Best keyword (min. shares)          
    float64

kw_max_max:                    
    Best keyword (max. shares)               
    float64

kw_avg_max:                    
    Best keyword (avg. shares)               
    float64

kw_min_avg:                    
    Avg. keyword (min. shares)               
    float64

kw_max_avg:                    
    Avg. keyword (max. shares)               
    float64

kw_avg_avg:                    
    Avg. keyword (avg. shares)          
    float64

self_reference_min_shares:    
    Min. shares of referenced articles in Mashable          
    float64

self_reference_max_shares:     
    Max. shares of referenced articles in Mashable          
    float64

self_reference_avg_sharess:   
    Avg. shares of referenced articles in Mashable          
    float64

weekday_is_monday:             
    Was the article published on a Monday?          
    Binary (Yes = 1 / No = 0)       
    float64

weekday_is_tuesday:            
    Was the article published on a Tuesday?             
    Binary (Yes = 1 / No = 0)       
    float64

weekday_is_wednesday:          
    Was the article published on a Wednesday?               
    Binary (Yes = 1 / No = 0)       
    float64

weekday_is_thursday:           
    Was the article published on a Thursday?            
    Binary (Yes = 1 / No = 0)       
    float64

weekday_is_friday:             
    Was the article published on a Friday?          
    Binary (Yes = 1 / No = 0)       
    float64

weekday_is_saturday:           
    Was the article published on a Saturday?            
    Binary (Yes = 1 / No = 0)       
    float64

weekday_is_sunday:              
    Was the article published on a Sunday?          
    Binary (Yes = 1 / No = 0)       
    float64

is_weekend:                    
    Was the article published on the weekend?               
    Binary (Yes = 1 / No = 0)       
    float64

LDA_00:                        
    LDA topic modeling 
    Closeness to LDA topic 0               
    float64

LDA_01:                       
    Closeness to LDA topic 1               
    float64

LDA_02:                        
    Closeness to LDA topic 2               
    float64

LDA_03:                       
    Closeness to LDA topic 3               
    float64

LDA_04:                        
    Closeness to LDA topic 4               
    float64

global_subjectivity:           
    Text subjectivity               
    float64

global_sentiment_polarity:     
    Text sentiment polarity               
    float64

global_rate_positive_words:    
    Rate of positive words in the content               
    float64

global_rate_negative_words:    
    Rate of negative words in the content               
    float64

rate_positive_words:           
    Rate of positive words among non-neutral tokens               
    float64

rate_negative_words:           
    Rate of negative words among non-neutral tokens               
    float64

avg_positive_polarity:         
    Avg. polarity of positive words               
    float64

min_positive_polarity:         
    Min. polarity of positive words               
    float64

max_positive_polarity:         
    Max. polarity of positive words               
    float64

avg_negative_polarity:         
    Avg. polarity of negative  words               
    float64

min_negative_polarity:         
    Min. polarity of negative  words               
    float64

max_negative_polarity:         
    Max. polarity of negative  words               
    float64

title_subjectivity:            
    Title subjectivity               
    float64

title_sentiment_polarity:      
    Title polarity               
    float64

abs_title_subjectivity:        
    Absolute subjectivity level               
    float64

abs_title_sentiment_polarity:  
    Absolute polarity level               
    float64

shares:                        
    Number of shares (target)               
    Integer

##### Newly Created (after previously done cleaning & any transformations):
url_name:               
    URL of the article (non-predictive)               
    Object (string)

Date:               
    The date the article was published               
    DateTime

Day_of_week:               
    What day of the week the article is posted on               
    Categorical

news_category:               
    What news category the article is               
    Categorical

Year:               
    The year the article was published               
    Integer

Month:               
    The month the article was published               
    Integer

log_shares:               
    log of the "shares" variable               
    Float

log_n_tokens_content:               
    log of the "n_tokens_content" variable               
    Float

log_num_hrefs:               
    log of the "num_hrefs" variable               
    Float

log_num_self_hrefs:               
    log of the "num_self_hrefs" variable               
    Float

log_num_imgs:               
    log of the "num_imgs" variable               
    Float

log_num_videos:               
    log of the "num_videos" variable               
    Float

log_kw_max_min:               
    log of the "kw_max_min" variable               
    Float

log_kw_min_max:               
    log of the "kw_min_max" variable               
    Float

log_kw_avg_avg:               
    log of the "kw_avg_avg" variable               
    Float

log_self_reference_min_shares:               
    log of the "self_reference_min_shares" variable               
    Float

log_self_reference_max_shares:               
    log of the "self_reference_max_shares" variable               
    Float

log_self_reference_avg_sharess:               
    log of the "self_reference_avg_sharess" variable               
    Float

day_of_weekX where X is the day of the week
    a binary value meaning either Yes (1) it is day X or No (0) it is not day x
     

### Data Pre-Processing

If your dataset has outliers, you may want to remove them before training your model, as outliers can skew the results of the model.

In [3]:
# Build the modelling frame `df1` from the loaded data: remove columns that
# are not used as features, then discard incomplete rows.

# Columns removed here:
#   url_name   - was a string and not helpful
#   date       - datetime change didn't work
#   log_shares - not as useful
df1 = df.drop(columns=['url_name', 'date', 'log_shares'])

#Factor columns that need it for certain models
# Factor the `news_category` column for other two tasks.
# df1 = pd.get_dummies(df1, drop_first=False,columns=['news_category'])

# Factor the `day_of_week` column for other two tasks.
# df1 = pd.get_dummies(df1, columns=['day_of_week'], drop_first=False)

# Drop rows containing NaN values. `dropna()` returns a new DataFrame instead
# of modifying `df1` in place, so the result must be assigned back; otherwise
# no rows are actually removed.
df1 = df1.dropna()

df1.head()

Unnamed: 0,timedelta,n_tokens_title,n_unique_tokens,average_token_length,num_keywords,kw_min_min,kw_avg_min,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,is_weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares,day_of_week,news_category,year,month,log_n_tokens_content,log_num_hrefs,log_num_self_hrefs,log_num_imgs,log_num_videos,log_kw_max_min,log_kw_min_max,log_kw_avg_avg,log_self_reference_min_shares,log_self_reference_max_shares,log_self_reference_avg_sharess
0,731.0,12.0,0.663594,4.680365,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500331,0.378279,0.040005,0.041263,0.040123,0.521617,0.092562,0.045662,0.013699,0.769231,0.230769,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593,Monday,Entertainment,2013,1,5.393628,1.609438,1.098612,0.693147,0.0,0.0,0.0,0.0,6.20859,6.20859,6.20859
1,731.0,8.0,0.821705,4.546154,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022265,0.022446,0.022276,0.251465,0.681548,0.381987,0.152189,0.038462,0.007692,0.833333,0.166667,0.353939,0.033333,0.7,-0.4,-0.4,-0.4,0.25,0.2,0.25,0.2,1300,Monday,Tech,2013,1,4.875197,2.079442,1.609438,0.0,0.0,0.0,0.0,0.0,7.170888,7.170888,7.170888
2,731.0,9.0,0.608602,4.759494,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028575,0.199626,0.028615,0.714611,0.028572,0.54258,0.12237,0.063291,0.025316,0.714286,0.285714,0.357269,0.05,0.6,-0.338889,-1.0,-0.05,0.65,-0.5,0.15,0.5,1100,Monday,Uncategorized,2013,1,6.163315,2.484907,0.0,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,731.0,10.0,0.53539,5.147748,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020011,0.020317,0.117255,0.020007,0.82241,0.425089,0.128515,0.03964,0.012613,0.758621,0.241379,0.337965,0.05,0.7,-0.225794,-0.4,-0.125,0.5,-0.1,0.0,0.1,1600,Monday,Tech,2013,1,6.320768,2.079442,1.94591,0.693147,0.0,0.0,0.0,0.0,7.550135,7.550135,7.550135
4,731.0,9.0,0.424132,4.63139,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025001,0.327017,0.025001,0.025001,0.597981,0.50652,0.279769,0.071749,0.013453,0.842105,0.157895,0.417055,0.1,1.0,-0.212354,-0.5,-0.05,0.333333,0.25,0.166667,0.25,2400,Monday,Tech,2013,1,7.017506,3.091042,3.091042,3.044522,0.0,0.0,0.0,0.0,6.302619,9.680406,8.140199


In [4]:
# Create the `share_quantile_ranges` target by binning `shares` at its quartiles.

# Quartile cut points of the share counts
q1 = df1['shares'].quantile(0.25)
q2 = df1['shares'].quantile(0.5)
q3 = df1['shares'].quantile(0.75)

# Define the bin labels
labels = ['<Q1', 'Q1-Q2', 'Q2-Q3', '>Q3']

# Cut the shares column into bins. The outer edges are open-ended so that
# every row receives a label: with fixed edges of 0 and 1000000, a share count
# of 0 or above one million would silently become NaN in the target.
# Intervals are right-inclusive, so a value equal to a quartile falls in the
# lower bin (e.g. shares == q1 is labelled '<Q1').
df1['share_quantile_ranges'] = pd.cut(
    df1['shares'],
    bins=[-np.inf, q1, q2, q3, np.inf],
    labels=labels,
)

# Show how many articles fall in each bin
print(df1['share_quantile_ranges'].value_counts())

Q1-Q2    10152
Q2-Q3     9932
<Q1       9930
>Q3       9630
Name: share_quantile_ranges, dtype: int64


In [None]:
# Remove target
# X = df1.drop(['share_quantile_ranges', 'day_of_week', 'news_category'], axis=1)
# y = df1['share_quantile_ranges']
# print(X.columns)
# print(y)

In [None]:
# random_state = 42

# # Create a StratifiedKFold object with n_splits=10
# kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state = random_state)

In [None]:
# Split the data into training and testing folds
# X_train, X_test, y_train, y_test = [], [], [], []
# for train_index, test_index in kfold.split(X, y):
#     X_train.append(X[train_index])
#     X_test.append(X[test_index])
#     y_train.append(y[train_index])
#     y_test.append(y[test_index])

Scaling data

In [None]:
# Scale the features in the training and testing sets using standard scalar.
# scaler = StandardScaler()

# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [None]:
# Scale the features using QuantileTransformer with n_quantiles=100 AFTER using StandardScaler().
# quantile_transformer = QuantileTransformer(n_quantiles=100)

# X_train_q = quantile_transformer.fit_transform(X_train)
# X_test_q = quantile_transformer.transform(X_test)

Scaling is done on the training and testing sets "X_train" and "X_test" in order to put the data on a common scale. This will be helpful in improving model performance, as the models are sensitive to the scale of the data. Scaling will be done for each classification task, and both types of scaling will be applied to the data separately so that the two methods can be compared later.

### Reduce Variables

##### Certain variables were deemed unnecessary during lab 1 and were removed during the course of this lab's notebook.

These variables were removed:

Url:        
    Dropped as it was better served being split into multiple variables.            
    These variables did not end up being useful or usable, however.  

n_tokens_content:           
    was removed after being log transformed.            
    This helped with the very skewed data.

n_non_stop_words:              
    Deemed unhelpful

n_non_stop_unique_tokens:           
    Deemed unhelpful

num_hrefs:          
    was removed after being log transformed.            
    This helped with the very skewed data.

num_self_hrefs:         
    was removed after being log transformed.            
    This helped with the very skewed data.
    
num_imgs:           
    was removed after being log transformed.            
    This helped with the very skewed data.

num_videos:         
    was removed after being log transformed.            
    This helped with the very skewed data.

kw_max_min:         
    was removed after being log transformed.            
    This helped with the very skewed data.

kw_min_max:         
    was removed after being log transformed.            
    This helped with the very skewed data.

self_reference_min_shares:          
    was removed after being log transformed.            
    This helped with the very skewed data.

self_reference_max_shares:          
    was removed after being log transformed.            
    This helped with the very skewed data.

self_reference_avg_sharess:             
    was removed after being log transformed.            
    This helped with the very skewed data.

weekday_is_X where X is the day of the week:        
    Removed previously.         
    This was to turn it into a categorical variable for the model being run at the time.        
    This variable has been recreated for our current classification problems as shown above with the factoring of day_of_week.      

data_channel_is_X where X is the type of data channel:      
    Removed previously.     
    This was to turn it into a categorical variable for the model being run at the time.        
    This variable has been recreated for our current classification problems as shown above with the factoring of news_category.        

In [5]:
# from previously cleaned dataset

# drop certain columns
# Done above
# df1 = df.drop('url_name', axis=1) # was a string
# df1 = df1.drop('date', axis=1) # datetime change didn't work.
# df1 = df1.drop('log_shares', axis=1) # not useful

# Drop rows containing NaN values. `dropna()` returns a new DataFrame instead
# of modifying `df1` in place, so the result must be assigned back; the row
# count before and after is compared to report how many rows were removed.
rows_before = len(df1)
df1 = df1.dropna()
print(f"Rows dropped because of NaN values: {rows_before - len(df1)}")

df1.head()

Unnamed: 0,timedelta,n_tokens_title,n_unique_tokens,average_token_length,num_keywords,kw_min_min,kw_avg_min,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,is_weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares,day_of_week,news_category,year,month,log_n_tokens_content,log_num_hrefs,log_num_self_hrefs,log_num_imgs,log_num_videos,log_kw_max_min,log_kw_min_max,log_kw_avg_avg,log_self_reference_min_shares,log_self_reference_max_shares,log_self_reference_avg_sharess,share_quantile_ranges
0,731.0,12.0,0.663594,4.680365,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500331,0.378279,0.040005,0.041263,0.040123,0.521617,0.092562,0.045662,0.013699,0.769231,0.230769,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593,Monday,Entertainment,2013,1,5.393628,1.609438,1.098612,0.693147,0.0,0.0,0.0,0.0,6.20859,6.20859,6.20859,<Q1
1,731.0,8.0,0.821705,4.546154,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022265,0.022446,0.022276,0.251465,0.681548,0.381987,0.152189,0.038462,0.007692,0.833333,0.166667,0.353939,0.033333,0.7,-0.4,-0.4,-0.4,0.25,0.2,0.25,0.2,1300,Monday,Tech,2013,1,4.875197,2.079442,1.609438,0.0,0.0,0.0,0.0,0.0,7.170888,7.170888,7.170888,Q1-Q2
2,731.0,9.0,0.608602,4.759494,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028575,0.199626,0.028615,0.714611,0.028572,0.54258,0.12237,0.063291,0.025316,0.714286,0.285714,0.357269,0.05,0.6,-0.338889,-1.0,-0.05,0.65,-0.5,0.15,0.5,1100,Monday,Uncategorized,2013,1,6.163315,2.484907,0.0,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Q1-Q2
3,731.0,10.0,0.53539,5.147748,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020011,0.020317,0.117255,0.020007,0.82241,0.425089,0.128515,0.03964,0.012613,0.758621,0.241379,0.337965,0.05,0.7,-0.225794,-0.4,-0.125,0.5,-0.1,0.0,0.1,1600,Monday,Tech,2013,1,6.320768,2.079442,1.94591,0.693147,0.0,0.0,0.0,0.0,7.550135,7.550135,7.550135,Q2-Q3
4,731.0,9.0,0.424132,4.63139,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025001,0.327017,0.025001,0.025001,0.597981,0.50652,0.279769,0.071749,0.013453,0.842105,0.157895,0.417055,0.1,1.0,-0.212354,-0.5,-0.05,0.333333,0.25,0.166667,0.25,2400,Monday,Tech,2013,1,7.017506,3.091042,3.091042,3.044522,0.0,0.0,0.0,0.0,6.302619,9.680406,8.140199,Q2-Q3


url_name:           
    Dropped due to it being a string variable that wasn't useful

date:
    Dropped due to it being a datetime variable that wasn't useful

log_shares:
    Dropped because the log-transformed share count was not as useful.             
    The share_quantile_ranges was deemed to be more useful.

All Na's were dropped (how many?)

The shares, day_of_week, and news_category variables will be removed when selecting the target variable further down. Variables related to the current task will also be removed, such as 'shares', 'day_of_week_Monday' (for example), and 'news_category_World' (for example). This avoids these variables essentially giving the model the right answer.

##### Feature Selection

In [None]:


# Feature matrix: every column except the target, the `shares` column it is
# binned from, and the two categorical label columns
X = df1.drop(columns=['share_quantile_ranges', 'shares', 'day_of_week', 'news_category'])
# Target: the share-count bin of each article
y = df1['share_quantile_ranges']

# Sequential feature selector built around a default LogisticRegression,
# scored by macro-averaged F1 and run on all available cores
sfs = SequentialFeatureSelector(
    estimator=LogisticRegression(),
    scoring='f1_macro',
    n_jobs=-1,
)
sfs.fit(X, y)

# Integer positions of the columns the selector kept
selected_features = sfs.get_support(indices=True)

# Translate the positions back into column names and display them
selected_feature_names = X.columns[selected_features]
print(selected_feature_names)

##### Using feature selection with a LogisticRegression estimator several variables were able to be removed. The following variables were deemed unimportant to the model:

    'timedelta'
    'n_tokens_title'
    'average_token_length'   
    'num_keywords'
    'kw_min_min'
    'kw_avg_min'
    'kw_max_max'
    'kw_avg_max' 
    'kw_min_avg'
    'kw_max_avg'
    'LDA_00'
    'LDA_04'
    'global_sentiment_polarity'
    'rate_negative_words'
    'title_subjectivity'
    'shares'
    'month'
    'log_n_tokens_content'
    'log_num_hrefs'  
    'log_num_self_hrefs'
    'log_num_imgs'
    'log_num_videos'
    'log_kw_max_min'
    'log_self_reference_min_shares'

##### Using feature selection with a LogisticRegression estimator the following variables were deemed most important to the model:
    'n_unique_tokens'
    'is_weekend'
    'LDA_01'
    'LDA_02'
    'LDA_03'
    'global_subjectivity'
    'global_rate_positive_words'
    'global_rate_negative_words'
    'rate_positive_words'
    'avg_positive_polarity'
    'min_positive_polarity'
    'max_positive_polarity'
    'avg_negative_polarity'
    'min_negative_polarity'
    'max_negative_polarity'
    'title_sentiment_polarity'
    'abs_title_subjectivity'
    'abs_title_sentiment_polarity'
    'year'
    'log_kw_min_max'
    'log_kw_avg_avg'
    'log_self_reference_max_shares'
    'log_self_reference_avg_sharess'

In [None]:
# Feature matrix: every column except the target, the `shares` column it is
# binned from, and the two categorical label columns
X = df1.drop(columns=['share_quantile_ranges', 'shares', 'day_of_week', 'news_category'])
# Target: the share-count bin of each article
y = df1['share_quantile_ranges']

# Sequential feature selector built around a default RandomForestClassifier,
# scored by macro-averaged F1 and run on all available cores
sfs = SequentialFeatureSelector(
    estimator=RandomForestClassifier(),
    scoring='f1_macro',
    n_jobs=-1,
)
sfs.fit(X, y)

# Integer positions of the columns the selector kept
selected_features = sfs.get_support(indices=True)

# Translate the positions back into column names and display them
selected_feature_names = X.columns[selected_features]
print(selected_feature_names)

##### Using feature selection with a RandomForest estimator several variables were able to be removed. The following variables were deemed unimportant to the model:

    'timedelta'
    'n_tokens_title'
    'average_token_length'
    'kw_min_min'
    'kw_max_max'
    'kw_avg_max'
    'kw_max_avg'
    'LDA_01'
    'LDA_02'
    'global_rate_negative_words'
    'rate_positive_words'
    'rate_negative_words'
    'min_positive_polarity'
    'avg_negative_polarity'
    'min_negative_polarity'
    'max_negative_polarity'
    'title_sentiment_polarity'
    'abs_title_subjectivity'
    'shares'
    'year'
    'month'
    'log_num_videos'
    'log_kw_min_max'
    'log_self_reference_max_shares'

##### Using feature selection with a RandomForest estimator the following variables were deemed most important to the model:
    'n_unique_tokens'
    'num_keywords'
    'kw_avg_min'
    'kw_min_avg'
    'is_weekend'
    'LDA_00'
    'LDA_03'
    'LDA_04'
    'global_subjectivity'
    'global_sentiment_polarity'
    'global_rate_positive_words'
    'avg_positive_polarity'
    'max_positive_polarity'
    'title_subjectivity'
    'abs_title_sentiment_polarity'
    'log_n_tokens_content'
    'log_num_hrefs'
    'log_num_self_hrefs'
    'log_num_imgs'
    'log_kw_max_min'
    'log_kw_avg_avg'
    'log_self_reference_min_shares'
    'log_self_reference_avg_sharess'

In [None]:
# Feature matrix: every column except the target, the `shares` column it is
# binned from, and the two categorical label columns
X = df1.drop(columns=['share_quantile_ranges', 'shares', 'day_of_week', 'news_category'])
# Target: the share-count bin of each article
y = df1['share_quantile_ranges']

# Sequential feature selector built around a default KNeighborsClassifier,
# scored by macro-averaged F1 and run on all available cores
sfs = SequentialFeatureSelector(
    estimator=KNeighborsClassifier(),
    scoring='f1_macro',
    n_jobs=-1,
)
sfs.fit(X, y)

# Integer positions of the columns the selector kept
selected_features = sfs.get_support(indices=True)

# Translate the positions back into column names and display them
selected_feature_names = X.columns[selected_features]
print(selected_feature_names)

##### Using feature selection with a KNN estimator several variables were able to be removed. The following variables were deemed unimportant to the model:

    'timedelta'
    'n_tokens_title'
    'average_token_length'
    'num_keywords'
    'kw_min_min'
    'kw_avg_min'
    'kw_max_max'
    'kw_avg_max'
    'kw_min_avg'
    'kw_max_avg'
    'max_positive_polarity'
    'avg_negative_polarity'
    'min_negative_polarity'
    'title_subjectivity'
    'title_sentiment_polarity'
    'abs_title_subjectivity'
    'shares'
    'year'
    'month'
    'log_num_self_hrefs'
    'log_num_imgs'
    'log_kw_max_min'
    'log_kw_min_max'
    'log_self_reference_min_shares'

##### Using feature selection with a KNN estimator the following variables were deemed most important to the model:
    'n_unique_tokens'
    'is_weekend'
    'LDA_00'
    'LDA_01'
    'LDA_02'
    'LDA_03'
    'LDA_04'
    'global_subjectivity'
    'global_sentiment_polarity'
    'global_rate_positive_words'
    'global_rate_negative_words'
    'rate_positive_words'
    'rate_negative_words'
    'avg_positive_polarity'
    'min_positive_polarity'
    'max_negative_polarity'
    'abs_title_sentiment_polarity'
    'log_n_tokens_content'
    'log_num_hrefs'
    'log_num_videos'
    'log_kw_avg_avg'
    'log_self_reference_max_shares'
    'log_self_reference_avg_sharess'

In [None]:
# Feature matrix: every column except the target, the `shares` column it is
# binned from, and the two categorical label columns
X = df1.drop(columns=['share_quantile_ranges', 'shares', 'day_of_week', 'news_category'])
# Target: the share-count bin of each article
y = df1['share_quantile_ranges']

# Initialize the sequential feature selector with a linear SVM.
# The kernelised SVC(C=1, kernel="linear") fitted on the raw, unscaled features
# had not returned any result after 20 hours: the selector refits the estimator
# for every candidate feature and CV split, and that solver scales poorly with
# the number of samples. LinearSVC fits the same kind of linear decision
# boundary with a solver intended for large sample counts, and the
# StandardScaler step puts the features on a common scale (re-fit on the
# training part of each CV split, so no information leaks from the held-out part).
# NOTE: LinearSVC uses squared hinge loss and one-vs-rest by default, so the
# chosen features may differ somewhat from what SVC(kernel="linear") would pick.
sfs = SequentialFeatureSelector(
    estimator=make_pipeline(StandardScaler(), LinearSVC(C=1, dual=False)),
    scoring='f1_macro',
    n_jobs=-1,
)

# Fit the selector to the data
sfs.fit(X, y)

# Get the selected features (integer column positions)
selected_features = sfs.get_support(indices=True)

# Get the column names of the selected features
selected_feature_names = X.columns[selected_features]

# Print the column names of the selected features
print(selected_feature_names)

##### Using feature selection with a SVM estimator several variables were able to be removed. The following variables were deemed unimportant to the model:

    'timedelta'
    'n_tokens_title'
    'n_unique_tokens'
    'average_token_length'
    'num_keywords'
    'kw_min_min'
    'kw_avg_min'
    'kw_max_max'
    'kw_avg_max'
    'kw_min_avg'
    'kw_max_avg'
    'is_weekend'
    'LDA_00'
    'LDA_01'
    'LDA_02'
    'LDA_03'
    'LDA_04'
    'global_subjectivity'
    'global_sentiment_polarity'
    'global_rate_positive_words'
    'global_rate_negative_words'
    'rate_positive_words'
    'rate_negative_words'
    'avg_positive_polarity'
    'min_positive_polarity'
    'max_positive_polarity'
    'avg_negative_polarity'
    'min_negative_polarity'
    'max_negative_polarity'
    'title_subjectivity'
    'title_sentiment_polarity'
    'abs_title_subjectivity'
    'abs_title_sentiment_polarity'
    'shares'
    'year'
    'month'
    'log_n_tokens_content'
    'log_num_hrefs'
    'log_num_self_hrefs'
    'log_num_imgs'
    'log_num_videos'
    'log_kw_max_min'
    'log_kw_min_max'
    'log_kw_avg_avg'
    'log_self_reference_min_shares'
    'log_self_reference_max_shares'
    'log_self_reference_avg_sharess'


##### Using feature selection with a SVM estimator the following variables were deemed most important to the model:
After 20 hours the feature selection had still not returned any results. Because of this, we will use a mix of the previously selected features to improve the model.

_________________
<a href="#top">Back to Top</a>
<a id="DataPrep2"></a>
# Data Preparation Part 2 [5 points]

## Describe the final dataset that is used for classification/regression
(include a description of any newly formed variables you created).

### share_quantile_ranges

In [6]:
# The code that built 'share_quantile_ranges' is kept below, commented out,
# as a record of how the column was derived from 'shares'.
# # Create bins using quantiles
# q1 = df1['shares'].quantile(0.25)
# q2 = df1['shares'].quantile(0.5)
# q3 = df1['shares'].quantile(0.75)

# # Define the bin labels
# labels = ['<Q1', 'Q1-Q2', 'Q2-Q3', '>Q3']

# # Cut the shares column into bins
# df1['share_quantile_ranges'] = pd.cut(df1['shares'], bins=[0, q1, q2, q3, 1000000], labels=labels)

# Count how many articles fall into each share_quantile_ranges bin and show the result
share_range_counts = df1['share_quantile_ranges'].value_counts()
print(share_range_counts)

Q1-Q2    10152
Q2-Q3     9932
<Q1       9930
>Q3       9630
Name: share_quantile_ranges, dtype: int64


share_quantile_ranges: This was created by splitting up the shares variable into bins by using its different quantile values. This allowed for a more even split among the groups.

### factoring day_of_week

In [None]:
# # Factor the `day_of_week` column.
# df1 = pd.get_dummies(df1, columns=['day_of_week'])

factored day_of_week columns: day_of_week was factored back into 7 columns of 0's and 1's similar to how it was in the original dataset.

### factoring news_category

In [None]:
# # Factor the `news_category` column.
# df1 = pd.get_dummies(df1, columns=['news_category'])

factored news_category columns: news_category was factored back into 6 columns of 0's and 1's, each indicating whether or not the article belongs to that category, similar to how it was in the original dataset.

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns of the final dataset
df1.describe()

Unnamed: 0,timedelta,n_tokens_title,n_unique_tokens,average_token_length,num_keywords,kw_min_min,kw_avg_min,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,is_weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares,year,month,log_n_tokens_content,log_num_hrefs,log_num_self_hrefs,log_num_imgs,log_num_videos,log_kw_max_min,log_kw_min_max,log_kw_avg_avg,log_self_reference_min_shares,log_self_reference_max_shares,log_self_reference_avg_sharess
count,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0,39644.0
mean,354.530471,10.398749,0.548216,4.548239,7.223767,26.106801,312.366967,752324.066694,259281.938083,1117.14661,5657.211151,0.130915,0.184599,0.141256,0.216321,0.22377,0.234029,0.44337,0.119309,0.039625,0.016612,0.68215,0.287934,0.353825,0.095446,0.756728,-0.259524,-0.521944,-0.1075,0.282353,0.071425,0.341843,0.156064,3395.380184,2013.540939,6.615856,5.889971,2.156564,1.208878,1.116427,0.40042,6.393888,5.045209,7.976327,6.195185,6.917477,6.667697
std,214.163767,2.114037,3.520708,0.844406,1.90913,69.633215,620.783887,214502.129573,135102.247285,1137.456951,6098.871957,0.337312,0.262975,0.219707,0.282145,0.295191,0.289183,0.116685,0.096931,0.017429,0.010828,0.190206,0.156156,0.104542,0.071315,0.247786,0.127726,0.29029,0.095373,0.324247,0.26545,0.188791,0.226294,11626.950749,0.498327,3.390683,1.255442,0.809445,0.692698,0.973755,0.680486,1.311168,4.521016,0.489467,3.076913,3.43243,3.280186
min,8.0,2.0,0.0,0.0,1.0,-1.0,-1.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.39375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,0.0,-1.0,0.0,0.0,1.0,2013.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,164.0,9.0,0.47087,4.478404,6.0,-1.0,141.75,843300.0,172846.875,0.0,3562.101631,0.0,0.025051,0.025012,0.028571,0.028571,0.028574,0.396167,0.057757,0.028384,0.009615,0.6,0.185185,0.306244,0.05,0.6,-0.328383,-0.7,-0.125,0.0,0.0,0.166667,0.0,946.0,2013.0,4.0,5.509388,1.609438,0.693147,0.693147,0.0,6.100319,0.0,7.776304,6.461468,7.003974,6.889782
50%,339.0,10.0,0.539226,4.664082,7.0,-1.0,235.5,843300.0,244572.222223,1023.635611,4355.688836,0.0,0.033387,0.033345,0.040004,0.040001,0.040727,0.453457,0.119117,0.039023,0.015337,0.710526,0.28,0.358755,0.1,0.8,-0.253333,-0.5,-0.1,0.15,0.0,0.5,0.0,1400.0,2014.0,7.0,6.016157,2.197225,1.386294,0.693147,0.0,6.493754,7.244942,7.962442,7.09091,7.937732,7.696667
75%,542.0,12.0,0.608696,4.854839,9.0,4.0,357.0,843300.0,330980.0,2056.781032,6019.953968,0.0,0.240958,0.150831,0.334218,0.375763,0.399986,0.508333,0.177832,0.050279,0.021739,0.8,0.384615,0.411428,0.1,1.0,-0.186905,-0.3,-0.05,0.5,0.15,0.5,0.25,2800.0,2014.0,10.0,6.575076,2.70805,1.609438,1.609438,0.693147,6.908755,8.974745,8.189031,7.863651,8.987322,8.556606
max,731.0,23.0,701.0,8.041534,10.0,377.0,42827.857143,843300.0,843300.0,3613.039819,298400.0,1.0,0.926994,0.925947,0.919999,0.926534,0.927191,1.0,0.727841,0.155488,0.184932,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.5,1.0,843300.0,2014.0,12.0,9.044876,5.720312,4.762174,4.859812,4.521789,12.606193,13.645079,10.682093,13.645079,13.645079,13.645079


________________
<a href="#top">Back to Top</a>
<a id="ModelEval1"></a>
# Modeling and Evaluation 1 (10 points total)

## Evaluation Metrics

F1 score was chosen as our most important evaluation metric for analyzing the results of the modeling.

F1 Score balances both precision and recall on the positive class which is ideal for this type of classification problem that predicts popularity based off shares, day of week, or news category as it is easy to interpret and communicate to our stakeholder Mashable. 

Accuracy is also tracked as it does provide correctly classified observations and it is always important to note but it is not the most significant metric in our final analysis of the models.

Overall, F1 is the best metric for avoiding misleading results caused by any imbalances in the dataset.

________________
<a href="#top">Back to Top</a>
<a id="ModelEval2"></a>
# Modeling and Evaluation 2 (10 points total)

## Dividing your data into training and testing splits

In [8]:

# Seed used to make the fold assignment reproducible
random_state = 42

# Stratified splitter: 10 folds, rows shuffled with the fixed seed above
n_folds = 10
kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)

## Tasks

share_quantile_ranges task

In [12]:
# One-hot encode the two categorical columns that are targets of the other
# tasks so they can serve as features here (day_of_week dummies first, then
# news_category dummies).
X0 = pd.get_dummies(df1, columns=['day_of_week', 'news_category'])

# Target of this task
y1 = X0['share_quantile_ranges']

# Features: everything except the target and 'shares', the variable the target is related to
X1 = X0.drop(columns=['share_quantile_ranges', 'shares'])

print(X1.columns)
print(y1)

Index(['timedelta', 'n_tokens_title', 'n_unique_tokens',
       'average_token_length', 'num_keywords', 'kw_min_min', 'kw_avg_min',
       'kw_max_max', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg', 'is_weekend',
       'LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity',
       'global_sentiment_polarity', 'global_rate_positive_words',
       'global_rate_negative_words', 'rate_positive_words',
       'rate_negative_words', 'avg_positive_polarity', 'min_positive_polarity',
       'max_positive_polarity', 'avg_negative_polarity',
       'min_negative_polarity', 'max_negative_polarity', 'title_subjectivity',
       'title_sentiment_polarity', 'abs_title_subjectivity',
       'abs_title_sentiment_polarity', 'year', 'month', 'log_n_tokens_content',
       'log_num_hrefs', 'log_num_self_hrefs', 'log_num_imgs', 'log_num_videos',
       'log_kw_max_min', 'log_kw_min_max', 'log_kw_avg_avg',
       'log_self_reference_min_shares', 'log_self_reference_max_shares',
       'log_se

In [13]:
# Materialise the stratified folds for the share_quantile_ranges task.
# Each list ends up with one entry per fold, in fold order.
X_train1, X_test1 = [], []
y_train1, y_test1 = [], []
for train_index, test_index in kfold.split(X1, y1):
    # split() yields positional row numbers, hence .iloc
    fold_X_train, fold_X_test = X1.iloc[train_index], X1.iloc[test_index]
    fold_y_train, fold_y_test = y1.iloc[train_index], y1.iloc[test_index]
    X_train1.append(fold_X_train)
    X_test1.append(fold_X_test)
    y_train1.append(fold_y_train)
    y_test1.append(fold_y_test)

In [None]:
# Sanity-check the row positions left over from the last fold of the split above.
# kfold.split() yields positional row numbers, not column labels, so they are
# validated against X1's row count. Comparing them with X1.columns (as before)
# could never match and made this check raise on every run.
fold_positions = np.concatenate([train_index, test_index])
invalid_positions = fold_positions[(fold_positions < 0) | (fold_positions >= len(X1))]
if invalid_positions.size:
    raise IndexError(f"The following row positions are not in the X1 DataFrame: {invalid_positions.tolist()}")


day_of_week task

In [None]:
# Build the day_of_week task: news_category is one-hot encoded so it can be
# used as a feature, while day_of_week stays un-encoded to act as the target.
X0 = pd.get_dummies(df1, columns=['news_category'])

# Target of this task
y2 = X0['day_of_week']

# Features: drop this task's target and the share_quantile_ranges categorical column
X2 = X0.drop(columns=['share_quantile_ranges', 'day_of_week'])
# NOTE(review): 'is_weekend' remains in X2 and looks derivable from day_of_week,
# which would be target leakage -- confirm whether it should be dropped.
# print(X2.columns)
# print(y2)

In [None]:
# Split the day_of_week data into training and testing folds.
# kfold.split() yields positional row numbers, so rows are selected with .iloc.
# Plain X2[train_index] would treat the numbers as column labels and raise a KeyError.
X_train2, X_test2, y_train2, y_test2 = [], [], [], []
for train_index, test_index in kfold.split(X2, y2):
    X_train2.append(X2.iloc[train_index])
    X_test2.append(X2.iloc[test_index])
    y_train2.append(y2.iloc[train_index])
    y_test2.append(y2.iloc[test_index])

news_category task

In [24]:
# Build the news_category task: day_of_week is one-hot encoded so it can be
# used as a feature, while news_category stays un-encoded to act as the target.
X0 = pd.get_dummies(df1, columns=['day_of_week'])

# Target of this task
y3 = X0['news_category']

# Features: drop this task's target and the share_quantile_ranges categorical column
X3 = X0.drop(columns=['share_quantile_ranges', 'news_category'])
# print(X3.columns)
# print(y3)

In [None]:
# Split the news_category data into training and testing folds.
# kfold.split() yields positional row numbers, so rows are selected with .iloc.
# Plain X3[train_index] would treat the numbers as column labels and raise a KeyError.
X_train3, X_test3, y_train3, y_test3 = [], [], [], []
for train_index, test_index in kfold.split(X3, y3):
    X_train3.append(X3.iloc[train_index])
    X_test3.append(X3.iloc[test_index])
    y_train3.append(y3.iloc[train_index])
    y_test3.append(y3.iloc[test_index])

We are using 10-fold cross validation for our training/testing split.
 
    It ensures that the training and testing splits are representative of the overall dataset. This is important because it helps to avoid overfitting the model to the training data.
    It reduces the risk of variance. This is because the model is trained and evaluated on 10 different folds of the data, rather than just a single split.
    It is relatively easy to implement. There are many libraries and frameworks that provide support for stratified cross-validation.


    It can be used to compare different machine learning models.
    It can be used to identify the best hyperparameters for your model.
    It can be used to estimate the confidence interval for your model's performance.


________________
<a href="#top">Back to Top</a>
<a id="ModelEval3"></a>
# Modeling and Evaluation 3 (20 points total)

## Scaling the Dataset

In [14]:
# Scale the features in the training and testing sets using StandardScaler.
# X_train*/X_test* are lists with one feature matrix per CV fold, so scaling
# is done fold by fold. Handing the whole list to the scaler raised
# "ValueError: ... inhomogeneous shape".
scaler = StandardScaler()


def _standardize_folds(train_folds, test_folds):
    """Standardize every fold using statistics learned from its training split.

    Args:
        train_folds: per-fold training feature matrices.
        test_folds: per-fold testing feature matrices, in the same fold order.

    Returns:
        Two lists of arrays: the scaled training folds and the scaled
        testing folds, in the original fold order.
    """
    scaled_train, scaled_test = [], []
    for fold_train, fold_test in zip(train_folds, test_folds):
        # Refit on this fold's training rows only, then apply the same
        # statistics to the fold's test rows
        scaled_train.append(scaler.fit_transform(fold_train))
        scaled_test.append(scaler.transform(fold_test))
    return scaled_train, scaled_test


# Use previous train/test split

# for share_quantile_range_task
X_train1, X_test1 = _standardize_folds(X_train1, X_test1)

# for day_of_week task
X_train2, X_test2 = _standardize_folds(X_train2, X_test2)

# for news_category task
X_train3, X_test3 = _standardize_folds(X_train3, X_test3)

In [15]:
# Transform the features in the training and testing sets using QuantileTransformer
# (applied to whatever X_train*/X_test* currently hold).
# X_train*/X_test* are lists with one feature matrix per CV fold, so the
# transform is done fold by fold. Handing the whole list to the transformer
# raised "ValueError: ... inhomogeneous shape".
quantile_transformer = QuantileTransformer(n_quantiles=100)


def _quantile_transform_folds(train_folds, test_folds):
    """Quantile-transform every fold using quantiles learned from its training split.

    Args:
        train_folds: per-fold training feature matrices.
        test_folds: per-fold testing feature matrices, in the same fold order.

    Returns:
        Two lists of arrays: the transformed training folds and the
        transformed testing folds, in the original fold order.
    """
    transformed_train, transformed_test = [], []
    for fold_train, fold_test in zip(train_folds, test_folds):
        transformed_train.append(quantile_transformer.fit_transform(fold_train))
        # transform(), not fit_transform(): the test rows must be mapped with
        # the quantiles learned from the training rows, not re-fitted on themselves
        transformed_test.append(quantile_transformer.transform(fold_test))
    return transformed_train, transformed_test


# Use previous train/test split

# for share_quantile_range_task
X_train_q1, X_test_q1 = _quantile_transform_folds(X_train1, X_test1)

# for day_of_week task
X_train_q2, X_test_q2 = _quantile_transform_folds(X_train2, X_test2)

# for news_category task
X_train_q3, X_test_q3 = _quantile_transform_folds(X_train3, X_test3)

________________
<a href="#top">Back to Top</a>
## Create three different classification/regression models
<a id="RFmodel"></a>
#### Random Forest

In [16]:
# Create the random forest model (100 trees).
# Seeded with the notebook-wide random_state so repeated runs give the same forest.
rf = RandomForestClassifier(n_estimators=100, random_state=random_state)

##### share_quantile_ranges task

In [18]:
# Train and evaluate the random forest on the share_quantile_ranges task, one fit per CV fold.
# y_train1 / y_test1 hold one label Series per fold, so the folds are walked in
# lockstep. Passing the whole lists to fit() raised "ValueError: ... inhomogeneous shape".
# NOTE(review): assumes X_train_q1 / X_test_q1 likewise hold one transformed
# feature matrix per fold, in the same fold order -- confirm.
rf_y_pred_q1 = []       # test-set predictions, one array per fold
rf_fold_scores_q1 = []  # [accuracy, precision, recall, F1], one row per fold
for X_tr, y_tr, X_te, y_te in zip(X_train_q1, y_train1, X_test_q1, y_test1):
    rf.fit(X_tr, y_tr)
    fold_pred = rf.predict(X_te)
    rf_y_pred_q1.append(fold_pred)
    rf_fold_scores_q1.append([
        accuracy_score(y_te, fold_pred),
        precision_score(y_te, fold_pred, average='macro'),
        recall_score(y_te, fold_pred, average='macro'),
        f1_score(y_te, fold_pred, average='macro'),
    ])

# Average each metric over the folds; order is accuracy, precision, recall, F1
rf_model_scores_q1 = np.mean(rf_fold_scores_q1, axis=0)
rf_accuracy_q1, rf_precision_q1, rf_recall_q1, rf_f1_score_q1 = rf_model_scores_q1

# Print the model evaluation metrics
print('Random Forest on share_quantile_ranges Task')
print(rf_model_scores_q1)
print('---------------------------------------------------------')
print('Random Forest accuracy:', rf_accuracy_q1)
print('Random Forest precision:', rf_precision_q1)
print('Random Forest recall:', rf_recall_q1)
print('Random Forest F1 score:', rf_f1_score_q1)

### Adjust parameters as appropriate to increase generalization performance using your chosen metric.

##### day_of_week task

In [None]:
# Train and evaluate the random forest on the day_of_week task, one fit per CV fold.
# y_train2 / y_test2 are per-fold lists, so the folds are walked in lockstep
# instead of passing the whole lists to fit()/predict().
# NOTE(review): assumes X_train_q2 / X_test_q2 likewise hold one transformed
# feature matrix per fold, in the same fold order -- confirm.
rf_y_pred_q2 = []       # test-set predictions, one array per fold
rf_fold_scores_q2 = []  # [accuracy, precision, recall, F1], one row per fold
for X_tr, y_tr, X_te, y_te in zip(X_train_q2, y_train2, X_test_q2, y_test2):
    rf.fit(X_tr, y_tr)
    fold_pred = rf.predict(X_te)
    rf_y_pred_q2.append(fold_pred)
    rf_fold_scores_q2.append([
        accuracy_score(y_te, fold_pred),
        precision_score(y_te, fold_pred, average='macro'),
        recall_score(y_te, fold_pred, average='macro'),
        f1_score(y_te, fold_pred, average='macro'),
    ])

# Average each metric over the folds; order is accuracy, precision, recall, F1
rf_model_scores_q2 = np.mean(rf_fold_scores_q2, axis=0)
rf_accuracy_q2, rf_precision_q2, rf_recall_q2, rf_f1_score_q2 = rf_model_scores_q2

# Print the model evaluation metrics
print('Random Forest on day_of_week Task')
print(rf_model_scores_q2)
print('---------------------------------------------------------')
print('Random Forest accuracy:', rf_accuracy_q2)
print('Random Forest precision:', rf_precision_q2)
print('Random Forest recall:', rf_recall_q2)
print('Random Forest F1 score:', rf_f1_score_q2)

### Adjust parameters as appropriate to increase generalization performance using your chosen metric.

________________
<a href="#top">Back to Top</a>
<a id="KNNmodel"></a>
#### KNN

##### share_quantile_ranges task

In [None]:
# Build the k-nearest-neighbours classifier used by the KNN cells (k = 5)
knn_settings = {"n_neighbors": 5}
knn = KNeighborsClassifier(**knn_settings)

In [None]:
# Train and evaluate KNN on the share_quantile_ranges task, one fit per CV fold.
# y_train1 / y_test1 hold one label Series per fold, so the folds are walked in
# lockstep instead of passing the whole lists to fit()/predict().
# NOTE(review): assumes X_train_q1 / X_test_q1 likewise hold one transformed
# feature matrix per fold, in the same fold order -- confirm.
knn_y_pred_q1 = []       # test-set predictions, one array per fold
knn_fold_scores_q1 = []  # [accuracy, precision, recall, F1], one row per fold
for X_tr, y_tr, X_te, y_te in zip(X_train_q1, y_train1, X_test_q1, y_test1):
    knn.fit(X_tr, y_tr)
    fold_pred = knn.predict(X_te)
    knn_y_pred_q1.append(fold_pred)
    knn_fold_scores_q1.append([
        accuracy_score(y_te, fold_pred),
        precision_score(y_te, fold_pred, average='macro'),
        recall_score(y_te, fold_pred, average='macro'),
        f1_score(y_te, fold_pred, average='macro'),
    ])

# Average each metric over the folds; order is accuracy, precision, recall, F1
knn_model_scores_q1 = np.mean(knn_fold_scores_q1, axis=0)
knn_accuracy_q1, knn_precision_q1, knn_recall_q1, knn_f1_score_q1 = knn_model_scores_q1

# Print the model evaluation metrics for the share_quantile_ranges task.
print('KNN on share_quantile_ranges Task')
print(knn_model_scores_q1)
print('---------------------------------------------------------')
print('KNN accuracy:', knn_accuracy_q1)
print('KNN precision:', knn_precision_q1)
print('KNN recall:', knn_recall_q1)
print('KNN F1 score:', knn_f1_score_q1)

### Adjust parameters as appropriate to increase generalization performance using your chosen metric.

##### day_of_week task

In [None]:
# Train and evaluate KNN on the day_of_week task, one fit per CV fold.
# y_train2 / y_test2 are per-fold lists, so the folds are walked in lockstep
# instead of passing the whole lists to fit()/predict().
# NOTE(review): assumes X_train_q2 / X_test_q2 likewise hold one transformed
# feature matrix per fold, in the same fold order -- confirm.
knn_y_pred_q2 = []       # test-set predictions, one array per fold
knn_fold_scores_q2 = []  # [accuracy, precision, recall, F1], one row per fold
for X_tr, y_tr, X_te, y_te in zip(X_train_q2, y_train2, X_test_q2, y_test2):
    knn.fit(X_tr, y_tr)
    fold_pred = knn.predict(X_te)
    knn_y_pred_q2.append(fold_pred)
    knn_fold_scores_q2.append([
        accuracy_score(y_te, fold_pred),
        precision_score(y_te, fold_pred, average='macro'),
        recall_score(y_te, fold_pred, average='macro'),
        f1_score(y_te, fold_pred, average='macro'),
    ])

# Average each metric over the folds; order is accuracy, precision, recall, F1
knn_model_scores_q2 = np.mean(knn_fold_scores_q2, axis=0)
knn_accuracy_q2, knn_precision_q2, knn_recall_q2, knn_f1_score_q2 = knn_model_scores_q2

# Print the model evaluation metrics for the day_of_week task.
print('KNN on day_of_week Task')
print(knn_model_scores_q2)
print('---------------------------------------------------------')
print('KNN accuracy:', knn_accuracy_q2)
print('KNN precision:', knn_precision_q2)
print('KNN recall:', knn_recall_q2)
print('KNN F1 score:', knn_f1_score_q2)

### Adjust parameters as appropriate to increase generalization performance using your chosen metric.

________________
<a href="#top">Back to Top</a>
<a id="SVMmodel"></a>
#### SVM

In [None]:
# Hyper-parameters of the SVM: regularisation strength and kernel type
C, kernel = 1.0, 'linear'

# Build the support vector classifier from those settings
support_vector_machine_model = SVC(kernel=kernel, C=C)

##### share_quantile_ranges task

In [None]:
# Train and evaluate the linear-kernel SVM on the share_quantile_ranges task, one fit per CV fold.
# y_train1 / y_test1 hold one label Series per fold, so the folds are walked in
# lockstep instead of passing the whole lists to fit()/predict().
# NOTE(review): assumes X_train_q1 / X_test_q1 likewise hold one transformed
# feature matrix per fold, in the same fold order -- confirm.
svm_y_pred_q1 = []       # test-set predictions, one array per fold
svm_fold_scores_q1 = []  # [accuracy, precision, recall, F1], one row per fold
for X_tr, y_tr, X_te, y_te in zip(X_train_q1, y_train1, X_test_q1, y_test1):
    support_vector_machine_model.fit(X_tr, y_tr)
    fold_pred = support_vector_machine_model.predict(X_te)
    svm_y_pred_q1.append(fold_pred)
    svm_fold_scores_q1.append([
        accuracy_score(y_te, fold_pred),
        precision_score(y_te, fold_pred, average='macro'),
        recall_score(y_te, fold_pred, average='macro'),
        f1_score(y_te, fold_pred, average='macro'),
    ])

# Average each metric over the folds; order is accuracy, precision, recall, F1
svm_model_scores_q1 = np.mean(svm_fold_scores_q1, axis=0)
svm_accuracy_q1, svm_precision_q1, svm_recall_q1, svm_f1_score_q1 = svm_model_scores_q1

# Print the model evaluation metrics for the share_quantile_ranges task.
print('SVM on share_quantile_range Task')
print(svm_model_scores_q1)
print('---------------------------------------------------------')
print('Support vector machine accuracy:', svm_accuracy_q1)
print('Support vector machine precision:', svm_precision_q1)
print('Support vector machine recall:', svm_recall_q1)
print('Support vector machine F1 score:', svm_f1_score_q1)

### Adjust parameters as appropriate to increase generalization performance using your chosen metric.

##### day_of_week task

In [None]:
# Train and evaluate the linear-kernel SVM on the day_of_week task, one fit per CV fold.
# y_train2 / y_test2 are per-fold lists, so the folds are walked in lockstep
# instead of passing the whole lists to fit()/predict().
# NOTE(review): assumes X_train_q2 / X_test_q2 likewise hold one transformed
# feature matrix per fold, in the same fold order -- confirm.
svm_y_pred_q2 = []       # test-set predictions, one array per fold
svm_fold_scores_q2 = []  # [accuracy, precision, recall, F1], one row per fold
for X_tr, y_tr, X_te, y_te in zip(X_train_q2, y_train2, X_test_q2, y_test2):
    support_vector_machine_model.fit(X_tr, y_tr)
    fold_pred = support_vector_machine_model.predict(X_te)
    svm_y_pred_q2.append(fold_pred)
    svm_fold_scores_q2.append([
        accuracy_score(y_te, fold_pred),
        precision_score(y_te, fold_pred, average='macro'),
        recall_score(y_te, fold_pred, average='macro'),
        f1_score(y_te, fold_pred, average='macro'),
    ])

# Average each metric over the folds; order is accuracy, precision, recall, F1
svm_model_scores_q2 = np.mean(svm_fold_scores_q2, axis=0)
svm_accuracy_q2, svm_precision_q2, svm_recall_q2, svm_f1_score_q2 = svm_model_scores_q2

# Print the model evaluation metrics for the day_of_week task
print('SVM on day_of_week Task')
print(svm_model_scores_q2)
print('---------------------------------------------------------')
print('Support vector machine accuracy:', svm_accuracy_q2)
print('Support vector machine precision:', svm_precision_q2)
print('Support vector machine recall:', svm_recall_q2)
print('Support vector machine F1 score:', svm_f1_score_q2)

### Adjust parameters as appropriate to increase generalization performance using your chosen metric.

________________
<a href="#top">Back to Top</a>
<a id="ModelEval4"></a>
# Modeling and Evaluation 4 (10 points total)

## Analyze the results using your chosen method of evaluation.

### Use visualizations of the results to bolster the analysis.


### Explain any visuals and analyze why they are interesting to someone that might use this model.

<a href="#top">Back to Top</a>
<a id="ModelEval5"></a>
# Modeling and Evaluation 5 (10 points total)

## Discuss the advantages of each model for each classification task, if any.

<a href="#top">Back to Top</a>
<a id="TaskEval"></a>
### Is the difference significant with 95% confidence? Use proper statistical comparison methods.

<a id="sqrTask"></a>
#### share_quantile_ranges task

In [None]:
# Show the QuantileTransformer-based score vector of each model
# (random forest, KNN, SVM) for the share_quantile_ranges task
for model_scores in (rf_model_scores_q1, knn_model_scores_q1, svm_model_scores_q1):
    print(model_scores)

In [None]:
# Collect the score vectors ([accuracy, precision, recall, F1]) of the three models
model1_scores = rf_model_scores_q1
model2_scores = knn_model_scores_q1
model3_scores = svm_model_scores_q1

# NOTE(review): each vector pairs four different metrics rather than repeated
# measurements of one metric (e.g. per-fold F1) -- confirm this is the intended
# input for a paired t-test.

# Perform a paired t-test
t_statistic, p_value1 = ttest_rel(model1_scores, model2_scores)

print("Random Forest vs. KNN")
# Check if the difference is significant with 95% confidence
if p_value1 < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")

print('---------------------------------------------------------')

# Perform a paired t-test
t_statistic, p_value2 = ttest_rel(model1_scores, model3_scores)

print("Random Forest vs. SVM")
# Check if the difference is significant with 95% confidence
if p_value2 < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")

print('---------------------------------------------------------')

# Perform a paired t-test
t_statistic, p_value3 = ttest_rel(model2_scores, model3_scores)

print("KNN vs. SVM")
# Check if the difference is significant with 95% confidence.
# This must test p_value3; testing p_value2 here would just repeat the
# Random Forest vs. SVM verdict.
if p_value3 < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")

<a href="#top">Back to Top</a>
<a id="dowTask"></a>
#### day_of_week task

In [None]:
# Show the QuantileTransformer-based score vector of each model
# (random forest, KNN, SVM) for the day_of_week task
for model_scores in (rf_model_scores_q2, knn_model_scores_q2, svm_model_scores_q2):
    print(model_scores)

In [None]:
# Collect the score vectors ([accuracy, precision, recall, F1]) of the three models
model1_scores = rf_model_scores_q2
model2_scores = knn_model_scores_q2
model3_scores = svm_model_scores_q2

# NOTE(review): each vector pairs four different metrics rather than repeated
# measurements of one metric (e.g. per-fold F1) -- confirm this is the intended
# input for a paired t-test.

# Perform a paired t-test
t_statistic, p_value1 = ttest_rel(model1_scores, model2_scores)

print("Random Forest vs. KNN")
# Check if the difference is significant with 95% confidence
if p_value1 < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")

print('---------------------------------------------------------')

# Perform a paired t-test
t_statistic, p_value2 = ttest_rel(model1_scores, model3_scores)

print("Random Forest vs. SVM")
# Check if the difference is significant with 95% confidence
if p_value2 < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")

print('---------------------------------------------------------')

# Perform a paired t-test
t_statistic, p_value3 = ttest_rel(model2_scores, model3_scores)

print("KNN vs. SVM")
# Check if the difference is significant with 95% confidence.
# This must test p_value3; testing p_value2 here would just repeat the
# Random Forest vs. SVM verdict.
if p_value3 < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")

________________
<a href="#top">Back to Top</a>
<a id="ModelEval6"></a>
# Modeling and Evaluation 6 (10 points total)

## Which attributes from your analysis are most important?

### Use proper methods discussed in class to evaluate the importance of different attributes.


### Discuss the results and hypothesize about why certain attributes are more important than others for a given classification task.

________________
<a href="#top">Back to Top</a>
<a id="Deployment"></a>
# Deployment (5 points total)

## How useful is your model for interested parties (i.e., the companies or organizations that might want to use it for prediction)?

### How would you measure the model's value if it was used by these parties?


### How would you deploy your model for interested parties?


### What other data should be collected?


### How often would the model need to be updated, etc.?

________________
<a href="#top">Back to Top</a>
<a id="Exceptional"></a>
# Exceptional Work (10 points total)

## Additional modeling.
The StandardScaler transformed data was taken down the same path as the QuantileTransformer transformed data. Each task was run separately, once with each scale transformer. This was done in order to compare the two and determine which, if either, is better than the other.

An additional task was performed using the three models. The 'news_category' task is focused on determining the category of news that the article falls under. The SVM model did not take to this task very well. The SVM model runtime is much too long for it to be useful, especially compared to the runtime of the other models.

##### StandardScaler Transformed on Random Forest model for each task

##### share_quantile_ranges task

In [None]:
# Train and evaluate the random forest on the share_quantile_ranges task, one fit per CV fold.
# X_train1 / X_test1 / y_train1 / y_test1 are lists with one entry per fold, so
# the folds are walked in lockstep instead of passing the whole lists to fit()/predict().
rf_y_pred1 = []       # test-set predictions, one array per fold
rf_fold_scores1 = []  # [accuracy, precision, recall, F1], one row per fold
for X_tr, y_tr, X_te, y_te in zip(X_train1, y_train1, X_test1, y_test1):
    rf.fit(X_tr, y_tr)
    fold_pred = rf.predict(X_te)
    rf_y_pred1.append(fold_pred)
    rf_fold_scores1.append([
        accuracy_score(y_te, fold_pred),
        precision_score(y_te, fold_pred, average='macro'),
        recall_score(y_te, fold_pred, average='macro'),
        f1_score(y_te, fold_pred, average='macro'),
    ])

# Average each metric over the folds; order is accuracy, precision, recall, F1
rf_model_scores1 = np.mean(rf_fold_scores1, axis=0)
rf_accuracy1, rf_precision1, rf_recall1, rf_f1_score1 = rf_model_scores1

# Print the model evaluation metrics
print('Random Forest on share_quantile_ranges Task')
print(rf_model_scores1)
print('---------------------------------------------------------')
print('Random Forest accuracy:', rf_accuracy1)
print('Random Forest precision:', rf_precision1)
print('Random Forest recall:', rf_recall1)
print('Random Forest F1 score:', rf_f1_score1)

##### day_of_week task

In [None]:
# Train and evaluate the random forest on the day_of_week task, one fit per CV fold.
# X_train2 / X_test2 / y_train2 / y_test2 are lists with one entry per fold, so
# the folds are walked in lockstep instead of passing the whole lists to fit()/predict().
rf_y_pred2 = []       # test-set predictions, one array per fold
rf_fold_scores2 = []  # [accuracy, precision, recall, F1], one row per fold
for X_tr, y_tr, X_te, y_te in zip(X_train2, y_train2, X_test2, y_test2):
    rf.fit(X_tr, y_tr)
    fold_pred = rf.predict(X_te)
    rf_y_pred2.append(fold_pred)
    rf_fold_scores2.append([
        accuracy_score(y_te, fold_pred),
        precision_score(y_te, fold_pred, average='macro'),
        recall_score(y_te, fold_pred, average='macro'),
        f1_score(y_te, fold_pred, average='macro'),
    ])

# Average each metric over the folds; order is accuracy, precision, recall, F1
rf_model_scores2 = np.mean(rf_fold_scores2, axis=0)
rf_accuracy2, rf_precision2, rf_recall2, rf_f1_score2 = rf_model_scores2

# Print the model evaluation metrics
print('Random Forest on day_of_week Task')
print(rf_model_scores2)
print('---------------------------------------------------------')
print('Random Forest accuracy:', rf_accuracy2)
print('Random Forest precision:', rf_precision2)
print('Random Forest recall:', rf_recall2)
print('Random Forest F1 score:', rf_f1_score2)

##### news_category task

In [None]:
# Train and evaluate the random forest on the news_category task, one fit per CV fold.
# X_train3 / X_test3 / y_train3 / y_test3 are lists with one entry per fold, so
# the folds are walked in lockstep instead of passing the whole lists to fit()/predict().
rf_y_pred3 = []       # test-set predictions, one array per fold
rf_fold_scores3 = []  # [accuracy, precision, recall, F1], one row per fold
for X_tr, y_tr, X_te, y_te in zip(X_train3, y_train3, X_test3, y_test3):
    rf.fit(X_tr, y_tr)
    fold_pred = rf.predict(X_te)
    rf_y_pred3.append(fold_pred)
    rf_fold_scores3.append([
        accuracy_score(y_te, fold_pred),
        precision_score(y_te, fold_pred, average='macro'),
        recall_score(y_te, fold_pred, average='macro'),
        f1_score(y_te, fold_pred, average='macro'),
    ])

# Average each metric over the folds; order is accuracy, precision, recall, F1
rf_model_scores3 = np.mean(rf_fold_scores3, axis=0)
rf_accuracy3, rf_precision3, rf_recall3, rf_f1_score3 = rf_model_scores3

# Print the model evaluation metrics
print('Random Forest on news_category Task')
print(rf_model_scores3)
print('---------------------------------------------------------')
print('Random Forest accuracy:', rf_accuracy3)
print('Random Forest precision:', rf_precision3)
print('Random Forest recall:', rf_recall3)
print('Random Forest F1 score:', rf_f1_score3)

____
##### StandardScaler Transformed on KNN model for each task

##### share_quantile_ranges task

In [None]:
# Train and evaluate KNN on the share_quantile_ranges task, one fit per CV fold.
# X_train1 / X_test1 / y_train1 / y_test1 are lists with one entry per fold, so
# the folds are walked in lockstep instead of passing the whole lists to fit()/predict().
knn_y_pred1 = []       # test-set predictions, one array per fold
knn_fold_scores1 = []  # [accuracy, precision, recall, F1], one row per fold
for X_tr, y_tr, X_te, y_te in zip(X_train1, y_train1, X_test1, y_test1):
    knn.fit(X_tr, y_tr)
    fold_pred = knn.predict(X_te)
    knn_y_pred1.append(fold_pred)
    knn_fold_scores1.append([
        accuracy_score(y_te, fold_pred),
        precision_score(y_te, fold_pred, average='macro'),
        recall_score(y_te, fold_pred, average='macro'),
        f1_score(y_te, fold_pred, average='macro'),
    ])

# Average each metric over the folds; order is accuracy, precision, recall, F1
knn_model_scores1 = np.mean(knn_fold_scores1, axis=0)
knn_accuracy1, knn_precision1, knn_recall1, knn_f1_score1 = knn_model_scores1

# Print the model evaluation metrics for the share_quantile_ranges task.
print('KNN on share_quantile_ranges Task')
print(knn_model_scores1)
print('---------------------------------------------------------')
print('KNN accuracy:', knn_accuracy1)
print('KNN precision:', knn_precision1)
print('KNN recall:', knn_recall1)
print('KNN F1 score:', knn_f1_score1)

##### day_of_week task

In [None]:
# Fit the KNN classifier on the day_of_week training split (X_train2 / y_train2).
# NOTE: this reuses the `knn` object already fit for share_quantile_ranges above.
knn.fit(X_train2, y_train2)

In [None]:
# Predict day_of_week labels for the held-out features X_test2 with the KNN
# classifier; the result is scored against y_test2 in the next cell.
knn_y_pred2 = knn.predict(X_test2)

In [None]:
# Score the KNN day_of_week predictions against y_test2.
# Precision, recall and F1 are all requested with average='macro'.
knn_accuracy2 = accuracy_score(y_test2, knn_y_pred2)
knn_precision2, knn_recall2, knn_f1_score2 = (
    score_fn(y_test2, knn_y_pred2, average='macro')
    for score_fn in (precision_score, recall_score, f1_score)
)

# Pack the four metrics in a fixed order: accuracy, precision, recall, F1.
knn_model_scores2 = np.array(
    [knn_accuracy2, knn_precision2, knn_recall2, knn_f1_score2]
)

# Report the packed array first, then each metric on its own labelled line.
print('KNN on day_of_week Task')
print(knn_model_scores2)
print('---------------------------------------------------------')
print('KNN accuracy:', knn_accuracy2)
print('KNN precision:', knn_precision2)
print('KNN recall:', knn_recall2)
print('KNN F1 score:', knn_f1_score2)

##### news_category task

In [None]:
# Fit the KNN classifier on the news_category training split (X_train3 / y_train3).
# NOTE: this reuses the `knn` object already fit for the two earlier tasks.
knn.fit(X_train3, y_train3)

In [None]:
# Predict news_category labels for the held-out features X_test3 with the KNN
# classifier; the result is scored against y_test3 in the next cell.
knn_y_pred3 = knn.predict(X_test3)

In [None]:
# Score the KNN news_category predictions against y_test3.
# Precision, recall and F1 are all requested with average='macro'.
knn_accuracy3 = accuracy_score(y_test3, knn_y_pred3)
knn_precision3, knn_recall3, knn_f1_score3 = (
    score_fn(y_test3, knn_y_pred3, average='macro')
    for score_fn in (precision_score, recall_score, f1_score)
)

# Pack the four metrics in a fixed order: accuracy, precision, recall, F1.
knn_model_scores3 = np.array(
    [knn_accuracy3, knn_precision3, knn_recall3, knn_f1_score3]
)

# Report the packed array first, then each metric on its own labelled line.
print('KNN on news_category Task')
print(knn_model_scores3)
print('---------------------------------------------------------')
print('KNN accuracy:', knn_accuracy3)
print('KNN precision:', knn_precision3)
print('KNN recall:', knn_recall3)
print('KNN F1 score:', knn_f1_score3)

____
##### StandardScaler-transformed data on the SVM model for each task

##### share_quantile_ranges task

In [None]:
# Fit the support vector machine on the share_quantile_ranges training split (X_train1 / y_train1).
# NOTE(review): the kernel (described as linear) is chosen where
# support_vector_machine_model is constructed, not here — confirm there.
support_vector_machine_model.fit(X_train1, y_train1)

In [None]:
# Predict share_quantile_ranges labels for the held-out features X_test1 with
# the SVM; the result is scored against y_test1 in the next cell.
svm_y_pred1 = support_vector_machine_model.predict(X_test1)

In [None]:
# Score the SVM share_quantile_ranges predictions against y_test1.
# Precision, recall and F1 are all requested with average='macro'.
svm_accuracy1 = accuracy_score(y_test1, svm_y_pred1)
svm_precision1, svm_recall1, svm_f1_score1 = (
    score_fn(y_test1, svm_y_pred1, average='macro')
    for score_fn in (precision_score, recall_score, f1_score)
)

# Pack the four metrics in a fixed order: accuracy, precision, recall, F1.
svm_model_scores1 = np.array(
    [svm_accuracy1, svm_precision1, svm_recall1, svm_f1_score1]
)

# Report the packed array first, then each metric on its own labelled line.
print('SVM on share_quantile_range Task')
print(svm_model_scores1)
print('---------------------------------------------------------')
print('Support vector machine accuracy:', svm_accuracy1)
print('Support vector machine precision:', svm_precision1)
print('Support vector machine recall:', svm_recall1)
print('Support vector machine F1 score:', svm_f1_score1)

##### day_of_week task

In [None]:
# Fit the support vector machine on the day_of_week training split (X_train2 / y_train2).
# NOTE(review): the kernel (described as linear) is chosen where
# support_vector_machine_model is constructed, not here — confirm there.
support_vector_machine_model.fit(X_train2, y_train2)

In [None]:
# Predict day_of_week labels for the held-out features X_test2 with the SVM;
# the result is scored against y_test2 in the next cell.
svm_y_pred2 = support_vector_machine_model.predict(X_test2)

In [None]:
# Score the SVM day_of_week predictions against y_test2.
# Precision, recall and F1 are all requested with average='macro'.
svm_accuracy2 = accuracy_score(y_test2, svm_y_pred2)
svm_precision2, svm_recall2, svm_f1_score2 = (
    score_fn(y_test2, svm_y_pred2, average='macro')
    for score_fn in (precision_score, recall_score, f1_score)
)

# Pack the four metrics in a fixed order: accuracy, precision, recall, F1.
svm_model_scores2 = np.array(
    [svm_accuracy2, svm_precision2, svm_recall2, svm_f1_score2]
)

# Report the packed array first, then each metric on its own labelled line.
print('SVM on day_of_week Task')
print(svm_model_scores2)
print('---------------------------------------------------------')
print('Support vector machine accuracy:', svm_accuracy2)
print('Support vector machine precision:', svm_precision2)
print('Support vector machine recall:', svm_recall2)
print('Support vector machine F1 score:', svm_f1_score2)

##### news_category task

In [None]:
# Train the support vector machine model on the training set using a linear kernel for the news_category task.
# support_vector_machine_model.fit(X_train3, y_train3)

In [None]:
# Make predictions on the test data for the news_category task.
# svm_y_pred3 = support_vector_machine_model.predict(X_test3)

In [None]:
# Calculate the accuracy, precision, recall, and F1 score of the model on the test data for the news_category task.
# svm_accuracy3 = accuracy_score(y_test3, svm_y_pred3)
# svm_precision3 = precision_score(y_test3, svm_y_pred3, average='macro')
# svm_recall3 = recall_score(y_test3, svm_y_pred3, average='macro')
# svm_f1_score3 = f1_score(y_test3, svm_y_pred3, average='macro')

# # Create an array to store the model evaluation metrics for the news_category task
# svm_model_scores3 = np.array([svm_accuracy3, svm_precision3, svm_recall3, svm_f1_score3])
# print('SVM on news_category Task')
# print(svm_model_scores3)
# print('---------------------------------------------------------')
# print('Support vector machine accuracy:', svm_accuracy3)
# print('Support vector machine precision:', svm_precision3)
# print('Support vector machine recall:', svm_recall3)
# print('Support vector machine F1 score:', svm_f1_score3)

___
<a href="#top">Back to Top</a>
<a id="ncTask"></a>
##### QuantileTransformer transformed for each model on 'news_category' task 

RandomForest

In [None]:
# Fit the random forest on X_train_q3 (the QuantileTransformer-transformed
# news_category features, per this section's heading) with labels y_train3.
# NOTE: this reuses the `rf` object that was fit on X_train3 earlier.
rf.fit(X_train_q3, y_train3)

In [None]:
# Predict news_category labels for the held-out features X_test_q3 with the
# random forest; the result is scored against y_test3 in the next cell.
rf_y_pred_q3 = rf.predict(X_test_q3)

In [None]:
# Score the random forest's news_category predictions (made from X_test_q3)
# against y_test3. Precision, recall and F1 all use average='macro'.
rf_accuracy_q3 = accuracy_score(y_test3, rf_y_pred_q3)
rf_precision_q3, rf_recall_q3, rf_f1_score_q3 = (
    score_fn(y_test3, rf_y_pred_q3, average='macro')
    for score_fn in (precision_score, recall_score, f1_score)
)

# Pack the four metrics in a fixed order: accuracy, precision, recall, F1.
rf_model_scores_q3 = np.array(
    [rf_accuracy_q3, rf_precision_q3, rf_recall_q3, rf_f1_score_q3]
)

# Report the packed array first, then each metric on its own labelled line.
print('Random Forest on news_category Task')
print(rf_model_scores_q3)
print('---------------------------------------------------------')
print('Random Forest accuracy:', rf_accuracy_q3)
print('Random Forest precision:', rf_precision_q3)
print('Random Forest recall:', rf_recall_q3)
print('Random Forest F1 score:', rf_f1_score_q3)

KNN

In [None]:
# Fit the KNN classifier on X_train_q3 (the QuantileTransformer-transformed
# news_category features, per this section's heading) with labels y_train3.
# NOTE: this reuses the `knn` object that was fit on X_train3 earlier.
knn.fit(X_train_q3, y_train3)

In [None]:
# Predict news_category labels for the held-out features X_test_q3 with the
# KNN classifier; the result is scored against y_test3 in the next cell.
knn_y_pred_q3 = knn.predict(X_test_q3)

In [None]:
# Score the KNN news_category predictions (made from X_test_q3) against
# y_test3. Precision, recall and F1 all use average='macro'.
knn_accuracy_q3 = accuracy_score(y_test3, knn_y_pred_q3)
knn_precision_q3, knn_recall_q3, knn_f1_score_q3 = (
    score_fn(y_test3, knn_y_pred_q3, average='macro')
    for score_fn in (precision_score, recall_score, f1_score)
)

# Pack the four metrics in a fixed order: accuracy, precision, recall, F1.
knn_model_scores_q3 = np.array(
    [knn_accuracy_q3, knn_precision_q3, knn_recall_q3, knn_f1_score_q3]
)

# Report the packed array first, then each metric on its own labelled line.
print('KNN on news_category Task')
print(knn_model_scores_q3)
print('---------------------------------------------------------')
print('KNN accuracy:', knn_accuracy_q3)
print('KNN precision:', knn_precision_q3)
print('KNN recall:', knn_recall_q3)
print('KNN F1 score:', knn_f1_score_q3)

SVM
This model did not run in a timely manner and was dropped. As mentioned before, the SVM models for the news_category task took markedly longer than their counterparts.

In [None]:
# # Train the support vector machine model on the training set using a linear kernel for the news_category task.
# support_vector_machine_model.fit(X_train_q3, y_train3)

In [None]:
# # Make predictions on the test data for the news_category task.
# svm_y_pred_q3 = support_vector_machine_model.predict(X_test_q3)

In [None]:
# # Calculate the accuracy, precision, recall, and F1 score of the model on the test data for the news_category task.
# svm_accuracy_q3 = accuracy_score(y_test3, svm_y_pred_q3)
# svm_precision_q3 = precision_score(y_test3, svm_y_pred_q3, average='macro')
# svm_recall_q3 = recall_score(y_test3, svm_y_pred_q3, average='macro')
# svm_f1_score_q3 = f1_score(y_test3, svm_y_pred_q3, average='macro')

# # Create an array to store the model evaluation metrics for the news_category task
# svm_model_scores_q3 = np.array([svm_accuracy_q3, svm_precision_q3, svm_recall_q3, svm_f1_score_q3])
# print('SVM on news_category Task')
# print(svm_model_scores_q3)
# print('---------------------------------------------------------')
# print('Support vector machine accuracy:', svm_accuracy_q3)
# print('Support vector machine precision:', svm_precision_q3)
# print('Support vector machine recall:', svm_recall_q3)
# print('Support vector machine F1 score:', svm_f1_score_q3)

______
Comparing the StandardScaler-transformed models' performance on each task

#### share_quantile_ranges task

In [None]:
# Show the share_quantile_ranges metric arrays for the three models,
# one per line, in the order Random Forest, KNN, SVM.
print(rf_model_scores1, knn_model_scores1, svm_model_scores1, sep='\n')

In [None]:
# Pairwise paired t-tests between the three models on the
# share_quantile_ranges task, each judged at the 5% significance level.
# NOTE(review): the knn/svm arrays built above hold four metrics (accuracy,
# precision, recall, F1) from one train/test split, so each test pairs
# metrics rather than repeated runs — confirm this is the intended comparison.
model1_scores = rf_model_scores1
model2_scores = knn_model_scores1
model3_scores = svm_model_scores1

# Random Forest vs. KNN
t_statistic, p_value1 = ttest_rel(model1_scores, model2_scores)

print("Random Forest vs. KNN")
# Check if the difference is significant with 95% confidence
if p_value1 < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")

print('---------------------------------------------------------')

# Random Forest vs. SVM
t_statistic, p_value2 = ttest_rel(model1_scores, model3_scores)

print("Random Forest vs. SVM")
# Check if the difference is significant with 95% confidence
if p_value2 < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")

print('---------------------------------------------------------')

# KNN vs. SVM
t_statistic, p_value3 = ttest_rel(model2_scores, model3_scores)

print("KNN vs. SVM")
# Must test p_value3 (this comparison), not p_value2 from the RF-vs-SVM test.
if p_value3 < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")

#### day_of_week task

In [None]:
# Show the day_of_week metric arrays for the three models,
# one per line, in the order Random Forest, KNN, SVM.
print(rf_model_scores2, knn_model_scores2, svm_model_scores2, sep='\n')

In [None]:
# Pairwise paired t-tests between the three models on the day_of_week task,
# each judged at the 5% significance level.
# NOTE(review): the knn/svm arrays built above hold four metrics (accuracy,
# precision, recall, F1) from one train/test split, so each test pairs
# metrics rather than repeated runs — confirm this is the intended comparison.
model1_scores = rf_model_scores2
model2_scores = knn_model_scores2
model3_scores = svm_model_scores2

# Random Forest vs. KNN
t_statistic, p_value1 = ttest_rel(model1_scores, model2_scores)

print("Random Forest vs. KNN")
# Check if the difference is significant with 95% confidence
if p_value1 < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")

print('---------------------------------------------------------')

# Random Forest vs. SVM
t_statistic, p_value2 = ttest_rel(model1_scores, model3_scores)

print("Random Forest vs. SVM")
# Check if the difference is significant with 95% confidence
if p_value2 < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")

print('---------------------------------------------------------')

# KNN vs. SVM
t_statistic, p_value3 = ttest_rel(model2_scores, model3_scores)

print("KNN vs. SVM")
# Must test p_value3 (this comparison), not p_value2 from the RF-vs-SVM test.
if p_value3 < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")

#### news_category task

In [None]:
# Show the news_category metric arrays, one per line: Random Forest, then KNN.
# The SVM fit for this task is commented out, so there is no
# svm_model_scores3 to display.
print(rf_model_scores3, knn_model_scores3, sep='\n')
# print(svm_model_scores3)

In [None]:
# Paired t-test between Random Forest and KNN on the news_category task,
# judged at the 5% significance level.
model1_scores, model2_scores = rf_model_scores3, knn_model_scores3
# model3_scores = svm_model_scores3

# Random Forest vs. KNN
t_statistic, p_value1 = ttest_rel(model1_scores, model2_scores)

print("Random Forest vs. KNN")
print(
    "The difference between the two models is statistically significant."
    if p_value1 < 0.05
    else "The difference between the two models is not statistically significant."
)

print('---------------------------------------------------------')

# The SVM comparisons stay disabled because the SVM fit for this task is
# commented out. Template kept for re-enabling:
#
# t_statistic, p_value2 = ttest_rel(model1_scores, model3_scores)
#
# print("Random Forest vs. SVM")
# if p_value2 < 0.05:
#     print("The difference between the two models is statistically significant.")
# else:
#     print("The difference between the two models is not statistically significant.")
#
# print('---------------------------------------------------------')
#
# t_statistic, p_value3 = ttest_rel(model2_scores, model3_scores)
#
# print("KNN vs. SVM")
# if p_value3 < 0.05:
#     print("The difference between the two models is statistically significant.")
# else:
#     print("The difference between the two models is not statistically significant.")

______
Comparing the QuantileTransformer-transformed models on the removed news_category task

In [None]:
# Show the news_category metric arrays from the QuantileTransformer runs,
# one per line: Random Forest, then KNN.
# The SVM fit for this task is commented out, so there is no
# svm_model_scores_q3 to display.
print(rf_model_scores_q3, knn_model_scores_q3, sep='\n')
# print(svm_model_scores_q3)

In [None]:
# Paired t-test between Random Forest and KNN on the news_category task
# (QuantileTransformer runs), judged at the 5% significance level.
model1_scores, model2_scores = rf_model_scores_q3, knn_model_scores_q3
# model3_scores = svm_model_scores_q3

# Random Forest vs. KNN
t_statistic, p_value1 = ttest_rel(model1_scores, model2_scores)

print("Random Forest vs. KNN")
print(
    "The difference between the two models is statistically significant."
    if p_value1 < 0.05
    else "The difference between the two models is not statistically significant."
)

print('---------------------------------------------------------')

# The SVM comparisons stay disabled because the SVM fit for this task is
# commented out. Template kept for re-enabling:
#
# t_statistic, p_value2 = ttest_rel(model1_scores, model3_scores)
#
# print("Random Forest vs. SVM")
# if p_value2 < 0.05:
#     print("The difference between the two models is statistically significant.")
# else:
#     print("The difference between the two models is not statistically significant.")
#
# print('---------------------------------------------------------')
#
# t_statistic, p_value3 = ttest_rel(model2_scores, model3_scores)
#
# print("KNN vs. SVM")
# if p_value3 < 0.05:
#     print("The difference between the two models is statistically significant.")
# else:
#     print("The difference between the two models is not statistically significant.")

____
Comparing the two scaling methods in Modeling and Evaluation 5

<a href="#top">Back to Top</a>
<a id="ScalerEval"></a>
##### Now determining whether there is a statistically significant difference between the transformation types StandardScaler and QuantileTransformer

<a id="RFEval"></a>
Random Forest Models

In [None]:
# Random forest metric arrays, one per line, in task order:
# share_quantile_ranges, day_of_week, news_category.

# StandardScaler runs
print(rf_model_scores1, rf_model_scores2, rf_model_scores3, sep='\n')

# QuantileTransformer runs
print(rf_model_scores_q1, rf_model_scores_q2, rf_model_scores_q3, sep='\n')

In [None]:
# Random forest: paired t-test of the StandardScaler run against the
# QuantileTransformer run for each task, judged at the 5% significance level.
# NOTE(review): the score arrays appear to hold four metrics (accuracy,
# precision, recall, F1) from one train/test split, so each test pairs
# metrics rather than repeated runs — confirm this is the intended comparison.

# StandardScaler runs (tasks 1-3)
model1_scores = rf_model_scores1
model2_scores = rf_model_scores2
model3_scores = rf_model_scores3

# QuantileTransformer runs (tasks 1-3)
model4_scores = rf_model_scores_q1
model5_scores = rf_model_scores_q2
model6_scores = rf_model_scores_q3

# share_quantile_ranges task
t_statistic, p_value1 = ttest_rel(model1_scores, model4_scores)
print("Random Forest Models")
print("StandardScalar vs. QuantileTransformation on share_quantile_ranges task")
# Check if the difference is significant with 95% confidence
if p_value1 < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")

print('---------------------------------------------------------')

# day_of_week task
t_statistic, p_value2 = ttest_rel(model2_scores, model5_scores)

print("StandardScalar vs. QuantileTransformation on day_of_week task")
# Check if the difference is significant with 95% confidence
if p_value2 < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")

print('---------------------------------------------------------')

# news_category task
t_statistic, p_value3 = ttest_rel(model3_scores, model6_scores)

print("StandardScalar vs. QuantileTransformation on news_category task")
# Must test p_value3 (this task), not p_value2 from the day_of_week test.
if p_value3 < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")

<a href="#top">Back to Top</a>
<a id="KNNEval"></a>

KNN models

In [None]:
# KNN metric arrays, one per line, in task order:
# share_quantile_ranges, day_of_week, news_category.

# StandardScaler runs
print(knn_model_scores1, knn_model_scores2, knn_model_scores3, sep='\n')

# QuantileTransformer runs
print(knn_model_scores_q1, knn_model_scores_q2, knn_model_scores_q3, sep='\n')

In [None]:
# KNN: paired t-test of the StandardScaler run against the
# QuantileTransformer run for each task, judged at the 5% significance level.
# NOTE(review): the score arrays hold four metrics (accuracy, precision,
# recall, F1) from one train/test split, so each test pairs metrics rather
# than repeated runs — confirm this is the intended comparison.

# StandardScaler runs (tasks 1-3)
model1_scores = knn_model_scores1
model2_scores = knn_model_scores2
model3_scores = knn_model_scores3

# QuantileTransformer runs (tasks 1-3)
model4_scores = knn_model_scores_q1
model5_scores = knn_model_scores_q2
model6_scores = knn_model_scores_q3

# share_quantile_ranges task
t_statistic, p_value1 = ttest_rel(model1_scores, model4_scores)
# Banner names the model family under test; this cell compares KNN models.
print("KNN Models")
print("StandardScalar vs. QuantileTransformation on share_quantile_ranges task")
# Check if the difference is significant with 95% confidence
if p_value1 < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")

print('---------------------------------------------------------')

# day_of_week task
t_statistic, p_value2 = ttest_rel(model2_scores, model5_scores)

print("StandardScalar vs. QuantileTransformation on day_of_week task")
# Check if the difference is significant with 95% confidence
if p_value2 < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")

print('---------------------------------------------------------')

# news_category task
t_statistic, p_value3 = ttest_rel(model3_scores, model6_scores)

print("StandardScalar vs. QuantileTransformation on news_category task")
# Must test p_value3 (this task), not p_value2 from the day_of_week test.
if p_value3 < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")

<a href="#top">Back to Top</a>
<a id="SVMEval"></a>

SVM models

In [None]:
# SVM metric arrays, one per line, in task order:
# share_quantile_ranges, then day_of_week.
# The news_category SVM fits are commented out, so svm_model_scores3 and
# svm_model_scores_q3 do not exist.

# StandardScaler runs
print(svm_model_scores1, svm_model_scores2, sep='\n')
# print(svm_model_scores3)

# QuantileTransformer runs
print(svm_model_scores_q1, svm_model_scores_q2, sep='\n')
# print(svm_model_scores_q3)

In [None]:
# SVM: paired t-test of the StandardScaler run against the
# QuantileTransformer run for each task that was trained, judged at the
# 5% significance level.
# NOTE(review): the score arrays appear to hold four metrics (accuracy,
# precision, recall, F1) from one train/test split, so each test pairs
# metrics rather than repeated runs — confirm this is the intended comparison.

# StandardScaler runs (tasks 1-2; the news_category fit is commented out)
model1_scores = svm_model_scores1
model2_scores = svm_model_scores2
# model3_scores = svm_model_scores3

# QuantileTransformer runs (tasks 1-2; the news_category fit is commented out)
model4_scores = svm_model_scores_q1
model5_scores = svm_model_scores_q2
# model6_scores = svm_model_scores_q3

# share_quantile_ranges task
t_statistic, p_value1 = ttest_rel(model1_scores, model4_scores)
# Banner names the model family under test; this cell compares SVM models.
print("SVM Models")
print("StandardScalar vs. QuantileTransformation on share_quantile_ranges task")
# Check if the difference is significant with 95% confidence
if p_value1 < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")

print('---------------------------------------------------------')

# day_of_week task
t_statistic, p_value2 = ttest_rel(model2_scores, model5_scores)

print("StandardScalar vs. QuantileTransformation on day_of_week task")
# Check if the difference is significant with 95% confidence
if p_value2 < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")

print('---------------------------------------------------------')

# news_category comparison stays disabled (no SVM scores exist for it).
# Template kept for re-enabling:
#
# t_statistic, p_value3 = ttest_rel(model3_scores, model6_scores)
#
# print("StandardScalar vs. QuantileTransformation on news_category task")
# if p_value3 < 0.05:
#     print("The difference between the two models is statistically significant.")
# else:
#     print("The difference between the two models is not statistically significant.")

### One idea: grid search parameters in a parallelized fashion and visualize the performances across attributes. Which parameters are most significant for making a good model for each classification algorithm?