In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# this import allows you train and test you test split
from sklearn.model_selection import train_test_split
# this import allows you to standardize your data, scaling so that all features have a mean of zero and a standard deviation of 1. 
from sklearn.preprocessing import StandardScaler
# this import allows you to create a logistic regression model; type of machine learning model that can be used for classification tasks 
from sklearn.linear_model import LogisticRegression
# this import allows you to create a support vector machine SVM model, a type of ML model that can be used for classification tasks. 
from sklearn.svm import SVC
# this import allows you to perform CV on your model, a technique for evaluating the performance of a ML on unseen data
from sklearn.model_selection import cross_val_score
# these imports allow you to calculate various evaluation metrics for your ML model. Eval metrics are used to asses the performance of a ML on held-out test set. 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

### In this mini-project the team decided to reuse the Mashable dataset from Project 1. As you recall the data was gathered between 2013-2014 in order to learn more about their readers in the hopes of creating more ad revenue. The data collected monitored a range quantified features such as sentiment, polarity, and number of shares per article.

### Here we load in our data and drop a few other columns that are categorical that will not be used in our models. 


In [5]:
# file path
filepath = "../1 - Visualization and Data Preprocessing/Data/ONPClean2.csv"
# Load the dataset
df = pd.read_csv(filepath)

# Set the maximum number of columns to display to None
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,url_name,date,timedelta,n_tokens_title,n_unique_tokens,average_token_length,num_keywords,kw_min_min,kw_avg_min,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,is_weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares,day_of_week,news_category,year,month,log_shares,log_n_tokens_content,log_num_hrefs,log_num_self_hrefs,log_num_imgs,log_num_videos,log_kw_max_min,log_kw_min_max,log_kw_avg_avg,log_self_reference_min_shares,log_self_reference_max_shares,log_self_reference_avg_sharess
0,amazon-instant-video-browser/,2013-01-07,731.0,12.0,0.663594,4.680365,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500331,0.378279,0.040005,0.041263,0.040123,0.521617,0.092562,0.045662,0.013699,0.769231,0.230769,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593,Monday,Entertainment,2013,1,6.386879,5.393628,1.609438,1.098612,0.693147,0.0,0.0,0.0,0.0,6.20859,6.20859,6.20859
1,reeddit-reddit/,2013-01-07,731.0,8.0,0.821705,4.546154,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022265,0.022446,0.022276,0.251465,0.681548,0.381987,0.152189,0.038462,0.007692,0.833333,0.166667,0.353939,0.033333,0.7,-0.4,-0.4,-0.4,0.25,0.2,0.25,0.2,1300,Monday,Tech,2013,1,7.170888,4.875197,2.079442,1.609438,0.0,0.0,0.0,0.0,0.0,7.170888,7.170888,7.170888
2,rage-comics-dying/,2013-01-07,731.0,9.0,0.608602,4.759494,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028575,0.199626,0.028615,0.714611,0.028572,0.54258,0.12237,0.063291,0.025316,0.714286,0.285714,0.357269,0.05,0.6,-0.338889,-1.0,-0.05,0.65,-0.5,0.15,0.5,1100,Monday,Uncategorized,2013,1,7.003974,6.163315,2.484907,0.0,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,power-matters-alliance-organization/,2013-01-07,731.0,10.0,0.53539,5.147748,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020011,0.020317,0.117255,0.020007,0.82241,0.425089,0.128515,0.03964,0.012613,0.758621,0.241379,0.337965,0.05,0.7,-0.225794,-0.4,-0.125,0.5,-0.1,0.0,0.1,1600,Monday,Tech,2013,1,7.378384,6.320768,2.079442,1.94591,0.693147,0.0,0.0,0.0,0.0,7.550135,7.550135,7.550135
4,polaroid-android-camera/,2013-01-07,731.0,9.0,0.424132,4.63139,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025001,0.327017,0.025001,0.025001,0.597981,0.50652,0.279769,0.071749,0.013453,0.842105,0.157895,0.417055,0.1,1.0,-0.212354,-0.5,-0.05,0.333333,0.25,0.166667,0.25,2400,Monday,Tech,2013,1,7.783641,7.017506,3.091042,3.091042,3.044522,0.0,0.0,0.0,0.0,6.302619,9.680406,8.140199


In [6]:
# drop certain columns
df1 = df.drop('url_name', axis=1) # was a string
df1 = df1.drop('date', axis=1) # datetime change didnt even work.
df1 = df1.drop('day_of_week', axis=1) # other categorical variable
df1 = df1.drop('news_category', axis=1) # other categorical variable

df1.head()


Unnamed: 0,timedelta,n_tokens_title,n_unique_tokens,average_token_length,num_keywords,kw_min_min,kw_avg_min,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,is_weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares,year,month,log_shares,log_n_tokens_content,log_num_hrefs,log_num_self_hrefs,log_num_imgs,log_num_videos,log_kw_max_min,log_kw_min_max,log_kw_avg_avg,log_self_reference_min_shares,log_self_reference_max_shares,log_self_reference_avg_sharess
0,731.0,12.0,0.663594,4.680365,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500331,0.378279,0.040005,0.041263,0.040123,0.521617,0.092562,0.045662,0.013699,0.769231,0.230769,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593,2013,1,6.386879,5.393628,1.609438,1.098612,0.693147,0.0,0.0,0.0,0.0,6.20859,6.20859,6.20859
1,731.0,8.0,0.821705,4.546154,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022265,0.022446,0.022276,0.251465,0.681548,0.381987,0.152189,0.038462,0.007692,0.833333,0.166667,0.353939,0.033333,0.7,-0.4,-0.4,-0.4,0.25,0.2,0.25,0.2,1300,2013,1,7.170888,4.875197,2.079442,1.609438,0.0,0.0,0.0,0.0,0.0,7.170888,7.170888,7.170888
2,731.0,9.0,0.608602,4.759494,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028575,0.199626,0.028615,0.714611,0.028572,0.54258,0.12237,0.063291,0.025316,0.714286,0.285714,0.357269,0.05,0.6,-0.338889,-1.0,-0.05,0.65,-0.5,0.15,0.5,1100,2013,1,7.003974,6.163315,2.484907,0.0,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,731.0,10.0,0.53539,5.147748,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020011,0.020317,0.117255,0.020007,0.82241,0.425089,0.128515,0.03964,0.012613,0.758621,0.241379,0.337965,0.05,0.7,-0.225794,-0.4,-0.125,0.5,-0.1,0.0,0.1,1600,2013,1,7.378384,6.320768,2.079442,1.94591,0.693147,0.0,0.0,0.0,0.0,7.550135,7.550135,7.550135
4,731.0,9.0,0.424132,4.63139,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025001,0.327017,0.025001,0.025001,0.597981,0.50652,0.279769,0.071749,0.013453,0.842105,0.157895,0.417055,0.1,1.0,-0.212354,-0.5,-0.05,0.333333,0.25,0.166667,0.25,2400,2013,1,7.783641,7.017506,3.091042,3.091042,3.044522,0.0,0.0,0.0,0.0,6.302619,9.680406,8.140199


### For the logistic regression model we take the number of shares per article as our response variable and categorize them by range. Shares equate to popularity and by understanding which features are needed to create an accurate model that can predict whether an article is popular or not may provide some insight into what about the article is driving popularity. Mashable could then share the findings with their journalists in order to adjust to their readers and hopefully create more ad revenue.


In [7]:
# Describe the `shares` column
df1['shares'].describe()

count     39644.000000
mean       3395.380184
std       11626.950749
min           1.000000
25%         946.000000
50%        1400.000000
75%        2800.000000
max      843300.000000
Name: shares, dtype: float64

### Now that we have our new categorical column 'share_ranges' some data cleaning and sanity checks are done to avoid errors as we create our model. 

In [8]:
# Create a new column called `share_ranges` with categorical levels
df1['share_ranges'] = pd.cut(df1['shares'], bins=[0, 2500, 5000, 7500, 10000, 20000, 100000, 1000000], labels=['<2500', '>2500 & <5000', '>5000 & <7500', '>7500 & <10000', '>10000 & <20000', '>20000 & <100000', '>100000'])

# Print the DataFrame
df1.head()

Unnamed: 0,timedelta,n_tokens_title,n_unique_tokens,average_token_length,num_keywords,kw_min_min,kw_avg_min,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,is_weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares,year,month,log_shares,log_n_tokens_content,log_num_hrefs,log_num_self_hrefs,log_num_imgs,log_num_videos,log_kw_max_min,log_kw_min_max,log_kw_avg_avg,log_self_reference_min_shares,log_self_reference_max_shares,log_self_reference_avg_sharess,share_ranges
0,731.0,12.0,0.663594,4.680365,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500331,0.378279,0.040005,0.041263,0.040123,0.521617,0.092562,0.045662,0.013699,0.769231,0.230769,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593,2013,1,6.386879,5.393628,1.609438,1.098612,0.693147,0.0,0.0,0.0,0.0,6.20859,6.20859,6.20859,<2500
1,731.0,8.0,0.821705,4.546154,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022265,0.022446,0.022276,0.251465,0.681548,0.381987,0.152189,0.038462,0.007692,0.833333,0.166667,0.353939,0.033333,0.7,-0.4,-0.4,-0.4,0.25,0.2,0.25,0.2,1300,2013,1,7.170888,4.875197,2.079442,1.609438,0.0,0.0,0.0,0.0,0.0,7.170888,7.170888,7.170888,<2500
2,731.0,9.0,0.608602,4.759494,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028575,0.199626,0.028615,0.714611,0.028572,0.54258,0.12237,0.063291,0.025316,0.714286,0.285714,0.357269,0.05,0.6,-0.338889,-1.0,-0.05,0.65,-0.5,0.15,0.5,1100,2013,1,7.003974,6.163315,2.484907,0.0,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,<2500
3,731.0,10.0,0.53539,5.147748,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020011,0.020317,0.117255,0.020007,0.82241,0.425089,0.128515,0.03964,0.012613,0.758621,0.241379,0.337965,0.05,0.7,-0.225794,-0.4,-0.125,0.5,-0.1,0.0,0.1,1600,2013,1,7.378384,6.320768,2.079442,1.94591,0.693147,0.0,0.0,0.0,0.0,7.550135,7.550135,7.550135,<2500
4,731.0,9.0,0.424132,4.63139,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025001,0.327017,0.025001,0.025001,0.597981,0.50652,0.279769,0.071749,0.013453,0.842105,0.157895,0.417055,0.1,1.0,-0.212354,-0.5,-0.05,0.333333,0.25,0.166667,0.25,2400,2013,1,7.783641,7.017506,3.091042,3.091042,3.044522,0.0,0.0,0.0,0.0,6.302619,9.680406,8.140199,<2500


In [9]:
# Describe the `shares` column
print(df1['share_ranges'].value_counts())
print(df1['share_ranges'].dtype)

<2500               28778
>2500 & <5000        5794
>5000 & <7500        1920
>10000 & <20000      1367
>7500 & <10000        967
>20000 & <100000      760
>100000                58
Name: share_ranges, dtype: int64
category


In [10]:
df1.dropna()

Unnamed: 0,timedelta,n_tokens_title,n_unique_tokens,average_token_length,num_keywords,kw_min_min,kw_avg_min,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,is_weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares,year,month,log_shares,log_n_tokens_content,log_num_hrefs,log_num_self_hrefs,log_num_imgs,log_num_videos,log_kw_max_min,log_kw_min_max,log_kw_avg_avg,log_self_reference_min_shares,log_self_reference_max_shares,log_self_reference_avg_sharess,share_ranges
0,731.0,12.0,0.663594,4.680365,5.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.500331,0.378279,0.040005,0.041263,0.040123,0.521617,0.092562,0.045662,0.013699,0.769231,0.230769,0.378636,0.100000,0.70,-0.350000,-0.600000,-0.200000,0.500000,-0.1875,0.000000,0.1875,593,2013,1,6.386879,5.393628,1.609438,1.098612,0.693147,0.000000,0.000000,0.000000,0.000000,6.208590,6.208590,6.208590,<2500
1,731.0,8.0,0.821705,4.546154,9.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.022265,0.022446,0.022276,0.251465,0.681548,0.381987,0.152189,0.038462,0.007692,0.833333,0.166667,0.353939,0.033333,0.70,-0.400000,-0.400000,-0.400000,0.250000,0.2000,0.250000,0.2000,1300,2013,1,7.170888,4.875197,2.079442,1.609438,0.000000,0.000000,0.000000,0.000000,0.000000,7.170888,7.170888,7.170888,<2500
2,731.0,9.0,0.608602,4.759494,7.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.028575,0.199626,0.028615,0.714611,0.028572,0.542580,0.122370,0.063291,0.025316,0.714286,0.285714,0.357269,0.050000,0.60,-0.338889,-1.000000,-0.050000,0.650000,-0.5000,0.150000,0.5000,1100,2013,1,7.003974,6.163315,2.484907,0.000000,0.693147,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,<2500
3,731.0,10.0,0.535390,5.147748,10.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.020011,0.020317,0.117255,0.020007,0.822410,0.425089,0.128515,0.039640,0.012613,0.758621,0.241379,0.337965,0.050000,0.70,-0.225794,-0.400000,-0.125000,0.500000,-0.1000,0.000000,0.1000,1600,2013,1,7.378384,6.320768,2.079442,1.945910,0.693147,0.000000,0.000000,0.000000,0.000000,7.550135,7.550135,7.550135,<2500
4,731.0,9.0,0.424132,4.631390,8.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.025001,0.327017,0.025001,0.025001,0.597981,0.506520,0.279769,0.071749,0.013453,0.842105,0.157895,0.417055,0.100000,1.00,-0.212354,-0.500000,-0.050000,0.333333,0.2500,0.166667,0.2500,2400,2013,1,7.783641,7.017506,3.091042,3.091042,3.044522,0.000000,0.000000,0.000000,0.000000,6.302619,9.680406,8.140199,<2500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39639,9.0,12.0,0.567227,4.313253,5.0,-1.0,42.600000,843300.0,571200.000000,2170.324903,3385.393320,0.0,0.040020,0.040004,0.040008,0.040000,0.839967,0.440992,0.266721,0.040161,0.008032,0.833333,0.166667,0.385909,0.136364,1.00,-0.145833,-0.166667,-0.125000,0.000000,0.0000,0.500000,0.0000,1100,2014,12,7.003974,5.521461,1.609438,1.098612,0.693147,0.693147,5.384495,9.994288,7.967156,8.071219,9.179984,8.771990,<2500
39640,9.0,13.0,0.570136,4.589286,10.0,-1.0,511.000000,843300.0,310130.000000,1500.000000,3900.000000,0.0,0.020009,0.219217,0.020005,0.118088,0.622681,0.384271,0.197662,0.044643,0.004464,0.909091,0.090909,0.348636,0.100000,0.50,-0.071429,-0.071429,-0.071429,0.800000,0.4000,0.300000,0.4000,1400,2014,12,7.244942,5.416100,2.079442,2.079442,0.693147,0.693147,8.131825,7.313887,7.946497,7.003974,7.550135,7.424165,<2500
39641,9.0,12.0,0.514925,4.263403,7.0,-1.0,525.000000,843300.0,224885.714286,1880.000000,6433.333333,0.0,0.028572,0.172060,0.028572,0.028572,0.742224,0.434468,0.169252,0.039627,0.016317,0.708333,0.291667,0.391176,0.166667,0.75,-0.179847,-0.312500,-0.025000,0.000000,0.0000,0.500000,0.0000,3200,2014,12,8.071219,6.063785,1.386294,1.386294,1.386294,0.000000,7.378384,8.366603,8.083845,7.170888,7.170888,7.170888,>2500 & <5000
39642,9.0,15.0,0.506261,5.005172,7.0,-1.0,88.857143,843300.0,266628.571429,1558.755814,4966.668990,0.0,0.028579,0.028573,0.792900,0.028571,0.121376,0.428833,0.188667,0.025862,0.008621,0.750000,0.250000,0.407333,0.100000,0.80,-0.115000,-0.125000,-0.100000,0.500000,0.5000,0.000000,0.5000,1700,2014,12,7.438972,6.364751,2.772589,1.098612,1.386294,0.000000,6.104793,9.752723,7.912769,7.244942,7.244942,7.244942,<2500


In [11]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39644 entries, 0 to 39643
Data columns (total 49 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   timedelta                       39644 non-null  float64 
 1   n_tokens_title                  39644 non-null  float64 
 2   n_unique_tokens                 39644 non-null  float64 
 3   average_token_length            39644 non-null  float64 
 4   num_keywords                    39644 non-null  float64 
 5   kw_min_min                      39644 non-null  float64 
 6   kw_avg_min                      39644 non-null  float64 
 7   kw_max_max                      39644 non-null  float64 
 8   kw_avg_max                      39644 non-null  float64 
 9   kw_min_avg                      39644 non-null  float64 
 10  kw_max_avg                      39644 non-null  float64 
 11  is_weekend                      39644 non-null  float64 
 12  LDA_00            

### Next the data is split 80/20 training/testing splits to be in our machine learning models. The 80/20 split was chosen as it is a common practice in machine learning and is used to prevent overfitting. 

### To avoid redundancy our response variable 'share_ranges' and 'log shares' are removed from the dataset before the split. To improve the performance of our models, the data splits were scaled so that features are normalized to reduce the impact of features with large ranges of values. 

In [12]:

X = df1.drop(['share_ranges', 'log_shares'], axis=1)
y = df1['share_ranges']
print(X.columns)
print(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Scale the features in the training and testing sets.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Index(['timedelta', 'n_tokens_title', 'n_unique_tokens',
       'average_token_length', 'num_keywords', 'kw_min_min', 'kw_avg_min',
       'kw_max_max', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg', 'is_weekend',
       'LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity',
       'global_sentiment_polarity', 'global_rate_positive_words',
       'global_rate_negative_words', 'rate_positive_words',
       'rate_negative_words', 'avg_positive_polarity', 'min_positive_polarity',
       'max_positive_polarity', 'avg_negative_polarity',
       'min_negative_polarity', 'max_negative_polarity', 'title_subjectivity',
       'title_sentiment_polarity', 'abs_title_subjectivity',
       'abs_title_sentiment_polarity', 'shares', 'year', 'month',
       'log_n_tokens_content', 'log_num_hrefs', 'log_num_self_hrefs',
       'log_num_imgs', 'log_num_videos', 'log_kw_max_min', 'log_kw_min_max',
       'log_kw_avg_avg', 'log_self_reference_min_shares',
       'log_self_reference_max_shares

In [13]:
# Sanity Check:  Check for NaNs in the dataset
for column in df1.columns:
    if df1[column].isnull().any():
        print('NaNs found in column:', column)

## Logistic Regression Model: 

### Tuning: 
### The model parameters are set below to have regularlization strength of 0.5 using the Lasso method (robust to noise, penalizes the abs value). The Stochastic Average Gradient Aggregation (saga) is used to train the logistic model with the max iterations to train the model at 10000. 
### 

### Model Advantages - Efficiency: 
### The models convergence rate was ~3 minutes. This is fast indicating our model is not all that complex. 

### Model Advantages - Performance
### The logistic model performed well with an accuracy score of 0.9236978181359566. This is quite good compared to our other models. The model also had a relatively high precision score of 0.8666999401751474, so not too many false positives. A logistic regression recall of 0.6653267287562078 and a Logistic regression F1 score of 0.6989320153829033 are also decent metrics. Overall the model performed quite well in accurately classifying which articles were popular or not. 


### Interpret Feature Importance - Weighted Coefficients Explanation:
### The weighted coefficients indicate that there are number of features with slight either negative or positive correlations with the response variable, however absolute subjectivity level stands out with a significant negative correlation with the response variable. This means that the more articles with higher subjective title rating are less likely to be popular. The weight for absolute subjectivity level is much larger than the other weighted coefficients which is another indication of how strong the correlation with the response variable is compared to that of other features. 


### Weights:
#### abs_title_subjectivity has weight of -40.002251666066044 -- significant negative correlation 
#### n_tokens_title has weight of 0.04345191046903477 - a slight positive correlation 
#### kw_max_max has weight of -0.0830884140822451 - slight negative correlation 
#### kw_avg_max has weight of -0.006586796170454732 -- slight negative correlation 
#### kw_max_avg has weight of -0.0038725518653367702 -- slight negative correlation 
#### LDA_00 has weight of 0.009177095489831295 -- slight positive correlation 
#### LDA_03 has weight of -0.0020298069288597516 -- slight negative correlation 
#### rate_negative_words has weight of 0.0050291038262997275 -- slight positive correlation 
#### max_positive_polarity has weight of 0.00047534585207426207 -- slight positive correlation 
#### title_subjectivity has weight of -0.029442231869322325 -- slight negative correlation 
#### title_sentiment_polarity has weight of -0.03335660022306888 -- slight negative correlation 
#### abs_title_sentiment_polarity has weight of 0.012962789739766606 -- slight positive correlation 
#### news_category has weight of -0.012658141079296227 -- slight negative correlation 
#### log_num_hrefs has weight of 0.07071379633299095 -- slight positive correlation 
#### log_num_self_hrefs has weight of -0.041213652666091616 - slight negative correlation 
#### log_num_imgs has weight of -0.031108272502564076 -- slight negative correlation 


In [14]:
# Set the model parameters.
C = 0.05
penalty = 'l1'
solver = 'saga'
max_iter=10000

# Create a logistic regression model.
logistic_regression_model = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=max_iter)
# Fit the model to the training data.
logistic_regression_model.fit(X_train, y_train)

# Make predictions on the test data.
y_pred = logistic_regression_model.predict(X_test)

# Get the weights from the trained model.
weights = logistic_regression_model.coef_.T

# Print the weight of each variable.
for weight, variable_name in zip(weights, df.columns[:-1]):
    print(f'{variable_name} has weight of {weight[0]}')

url_name has weight of 0.0
date has weight of 0.0
timedelta has weight of 0.0
n_tokens_title has weight of 0.04345191046903477
n_unique_tokens has weight of 0.0
average_token_length has weight of 0.0
num_keywords has weight of 0.0
kw_min_min has weight of 0.0
kw_avg_min has weight of 0.0
kw_max_max has weight of -0.0830884140822451
kw_avg_max has weight of -0.006586796170454732
kw_min_avg has weight of 0.0
kw_max_avg has weight of -0.0038725518653367702
is_weekend has weight of 0.0
LDA_00 has weight of 0.009177095489831295
LDA_01 has weight of 0.0
LDA_02 has weight of 0.0
LDA_03 has weight of -0.0020298069288597516
LDA_04 has weight of 0.0
global_subjectivity has weight of 0.0
global_sentiment_polarity has weight of 0.0
global_rate_positive_words has weight of 0.0
global_rate_negative_words has weight of 0.0
rate_positive_words has weight of 0.0
rate_negative_words has weight of 0.0050291038262997275
avg_positive_polarity has weight of 0.0
min_positive_polarity has weight of 0.0
max_po

In [15]:
logistic_regression_accuracy = accuracy_score(y_test, y_pred)
logistic_regression_precision = precision_score(y_test, y_pred, average='macro')
logistic_regression_recall = recall_score(y_test, y_pred, average='macro')
logistic_regression_f1_score = f1_score(y_test, y_pred, average='macro')
# logistic_regression_roc_auc_score = roc_curve(y_test, y_pred[:, 1])


# Print the metrics.
print('Logistic regression accuracy:', logistic_regression_accuracy)
print('Logistic regression precision:', logistic_regression_precision)
print('Logistic regression recall:', logistic_regression_recall)
print('Logistic regression F1 score:', logistic_regression_f1_score)

Logistic regression accuracy: 0.9236978181359566
Logistic regression precision: 0.8666999401751474
Logistic regression recall: 0.6653267287562078
Logistic regression F1 score: 0.6989320153829033


### Support Vector Machine Model:

### Tuning: 
### The model parameters are set below to have regularlization strength of 1.0, a linear kernal is used to train the SVM with the gamma set to 0.1 to avoid overfitting (a higher gamma more prone to overfitting as makes the decision boundary more complex). 

### Model Advantages - Efficiency: 
### The models convergence rate was ~14.4 seconds. This is fast indicating our model is not complex.  

### Model Advantages - Performance
### The logistic model performed extremely well with an accuracy score of 0.9860007567158532. This is quite good better than the logistic regression model. The model also had a high precision score of 0.9552859295192899, so not too many false positives. A logistic regression recall of 0.9381806456213155 and a Logistic regression F1 score of 0.9463855789589289 are also good metrics. Overall the model performed very well in accurately classifying which articles were popular or not. 

### Interpret Feature Importance - Weighted Coefficients Explanation:
### Just as in the previous logistic regression model, the SVM weighted coefficients indicate that there are number of features with slight either negative or positive correlations with the response variable, however absolute subjectivity level stands out with a negative correlation with the response variable. This means that the more articles with higher subjective title rating are less likely to be popular. The weight for absolute subjectivity level is much larger than the other weighted coefficients which is another indication of how strong the correlation with the response variable is compared to that of other features.

### Weights: 
#### abs_title_subjectivity has weight of -3.2124239921622424 -- negative correlation
#### url_name has weight of -0.0002477970466832069 -- slight negative correlation 
#### date has weight of -4.39394437103946e-05 -- slight negative correlation 
#### timedelta has weight of -0.0191769821289276 -- slight negative correlation 
#### n_tokens_title has weight of 4.493937662020109e-05 -- slight positive correlation 
#### n_unique_tokens has weight of 1.5327936830900057e-05 -- slight positive correlation 
#### average_token_length has weight of 7.495918981410832e-05 -- slight negative correlation 
#### num_keywords has weight of 0.00010830315772131227 -- slight positive correlation 
#### kw_min_min has weight of -0.00012821209954989865 -- slight negative correlation 
#### kw_avg_min has weight of 7.950518713450982e-05 -- slight positive correlation 
#### kw_max_max has weight of 0.00020965278001855436 -- slight positive correlation 
#### kw_avg_max has weight of 0.00010072277917916317 -- slight positive correlation 
#### kw_min_avg has weight of -1.6646353786842205e-05 -- slight negative correlation 
#### kw_max_avg has weight of -4.983241174549846e-05 -- slight negative correlation 
#### is_weekend has weight of 5.22136165903575e-05 -- slight positive correlation 
#### LDA_00 has weight of 6.715899686149385e-06 -- slight positive correlation 
#### LDA_01 has weight of -3.15658796556928e-05 -- slight negative correlation 
#### LDA_02 has weight of 3.151243565957529e-05 -- slight positive correlation 
#### LDA_03 has weight of -4.9260978921439325e-05 -- slight negative correlation 
#### LDA_04 has weight of 1.984912670766059e-05 -- slight positive correlation 
#### global_subjectivity has weight of 3.9704001419726964e-05 -- slight positive correlation 
#### global_sentiment_polarity has weight of 3.0220859169177716e-05 -- slight positive correlation 
#### global_rate_positive_words has weight of 0.00116357875010481 -- slight positive correlation 
#### global_rate_negative_words has weight of 0.0009168286734000386 -- slight positive correlation 
#### rate_positive_words has weight of -6.196459336210713e-05 -- slight negative correlation 
#### rate_negative_words has weight of 8.7124657542903e-05 -- slight positive correlation 
#### avg_positive_polarity has weight of 0.00010739256754516147 -- slight positive correlation 
#### min_positive_polarity has weight of -2.555434414699964e-05 -- slight negative correlation 
#### max_positive_polarity has weight of -9.232031633643611e-05 -- slight negative correlation 
#### avg_negative_polarity has weight of 4.210584788089111e-05 -- slight negative correlation 
#### min_negative_polarity has weight of -3.960652437751122e-05 -- slight negative correlation 
#### max_negative_polarity has weight of 4.231652938280206e-06 -- slight positive correlation
#### title_subjectivity has weight of -4.136044504865488e-05 -- slight negative correlation 
#### title_sentiment_polarity has weight of 2.041336979985431e-05 -- slight positive correlation
#### abs_title_sentiment_polarity has weight of -0.0001411324053754992 -- slight negative correlation
#### shares has weight of 6.559283931117932e-06 -- slight positive correlation
#### day_of_week has weight of -0.0009180579626699714 -- slight negative correlation
#### news_category has weight of 0.000159769646074448 -- slight positive correlation
#### year has weight of -3.0124282420662674e-05 -- slight negative correlation
#### month has weight of -9.601022543903603e-05 -- slight negative correlation
#### log_shares has weight of -1.6978884219254198e-05 -- slight negative correlation
#### log_n_tokens_content has weight of -0.0001350102171213674 -- slight negative correlation
#### log_num_hrefs has weight of -0.00017773739868343075 -- slight negative correlation
#### log_num_self_hrefs has weight of -8.007039585322673e-05 -- slight negative correlation
#### log_num_imgs has weight of 0.00014782913837296796 -- slight positive correlation
#### log_num_videos has weight of 0.0014105045214556355 -- slight positive correlation 
#### log_kw_max_min has weight of -0.0015946337719641157 -- slight negative correlation 

In [25]:
# Set the model parameters
C = 1.0
# We cannot use rbf because it is a nonlinear kernal 
# kernel = 'rbf'
kernel = 'linear'
gamma = 0.1

# Create the support vector machine model
support_vector_machine_model = SVC(C=C, kernel=kernel, gamma=gamma)

# Train the support vector machine model on the training set using a linear kernel.
support_vector_machine_model.fit(X_train, y_train)
# Make predictions on the test data.
y_pred = support_vector_machine_model.predict(X_test)

weights = support_vector_machine_model.coef_.T

# # Print the weight of each variable.
for weight, variable_name in zip(weights, df.columns[:-1]):
    print(f'{variable_name} has weight of {weight[0]}')


url_name has weight of -0.0002477970466832069
date has weight of -4.39394437103946e-05
timedelta has weight of -0.0191769821289276
n_tokens_title has weight of 4.493937662020109e-05
n_unique_tokens has weight of 1.5327936830900057e-05
average_token_length has weight of 7.495918981410832e-05
num_keywords has weight of 0.00010830315772131227
kw_min_min has weight of -0.00012821209954989865
kw_avg_min has weight of 7.950518713450982e-05
kw_max_max has weight of 0.00020965278001855436
kw_avg_max has weight of 0.00010072277917916317
kw_min_avg has weight of -1.6646353786842205e-05
kw_max_avg has weight of -4.983241174549846e-05
is_weekend has weight of 5.22136165903575e-05
LDA_00 has weight of 6.715899686149385e-06
LDA_01 has weight of -3.15658796556928e-05
LDA_02 has weight of 3.151243565957529e-05
LDA_03 has weight of -4.9260978921439325e-05
LDA_04 has weight of 1.984912670766059e-05
global_subjectivity has weight of 3.9704001419726964e-05
global_sentiment_polarity has weight of 3.0220859

In [26]:
# Calculate the accuracy, precision, recall, and F1 score of the model on the test data.
support_vector_machine_accuracy = accuracy_score(y_test, y_pred)
support_vector_machine_precision = precision_score(y_test, y_pred, average='macro')
support_vector_machine_recall = recall_score(y_test, y_pred, average='macro')
support_vector_machine_f1_score = f1_score(y_test, y_pred, average='macro')
# support_vector_machine_roc_auc_score = roc_auc_score(y_test, y_pred, average='macro')

print('---------------------------------------------------------')

# Print the metrics.
print('Support vector machine accuracy:', support_vector_machine_accuracy)

print('Support vector machine precision:', support_vector_machine_precision)
print('Support vector machine recall:', support_vector_machine_recall)
print('Support vector machine F1 score:', support_vector_machine_f1_score)
# print('Support vector machine ROC AUC score:', support_vector_machine_roc_auc_score)

---------------------------------------------------------
Support vector machine accuracy: 0.9860007567158532
Support vector machine precision: 0.9552859295192899
Support vector machine recall: 0.9381806456213155
Support vector machine F1 score: 0.9463855789589289


### SVM using SGD: ???? NEED TO Evaluate this section ??? SVM - SGD performed worse than the SVM model. 

### Tuning: 
### ### Ridge regularlization was used on teh SVM_SGD Model as Lasso shrunk all the weights to zero 
### Lasso l1, the model has weights 
### alpha 0.0001 versus 1 -- changes metrics 

### Model Advantages - Efficiency: 
### The models convergence rate was ~23.5 seconds. This is fast indicating our model is not complex.  

### Model Advantages - Performance
### The logistic model performed extremely well with an accuracy score of 0.814226258040106. This is quite good better than the logistic regression model. The model also had a high precision score of 0.4979519020952421, so not too many false positives. A logistic regression recall of 0.4592501273976919 and a Logistic regression F1 score of 0.45320351649627905 are also good metrics. Overall the model performed very well in accurately classifying which articles were popular or not. 
Support vector machine using SGD accuracy: 0.8122083490982469
Support vector machine using SGD precision: 0.5037453755087606
Support vector machine using SGD recall: 0.4628526565447935
Support vector machine using SGD F1 score: 0.45440550615686714


### Interpret Feature Importance - Weighted Coefficients Explanation:
### Just as in the previous logistic regression model, the SVM weighted coefficients indicate that there are number of features with slight either negative or positive correlations with the response variable, however absolute subjectivity level stands out with a negative correlation with the response variable. This means that the more articles with higher subjective title rating are less likely to be popular. The weight for absolute subjectivity level is much larger than the other weighted coefficients which is another indication of how strong the correlation with the response variable is compared to that of other features.


### Weights: 
#### url_name has weight of -0.22080652042033538
#### date has weight of -0.007595301676654267
#### timedelta has weight of -0.18988023109706784
#### n_tokens_title has weight of 0.06180432150684208
#### n_unique_tokens has weight of 0.002297777107482027
#### average_token_length has weight of -0.015564282053148871
#### num_keywords has weight of -0.01677362395921804
#### kw_min_min has weight of -0.03574050396518777
#### kw_avg_min has weight of 0.01284274194595419
#### kw_max_max has weight of -0.10172491579455416
#### kw_avg_max has weight of -0.008024281483641928
#### kw_min_avg has weight of 0.0017795427769386198
#### kw_max_avg has weight of 0.0052608863049452064
#### is_weekend has weight of -0.0022931096309335196
#### LDA_00 has weight of -0.02508134994649698
#### LDA_01 has weight of 0.013626503563109276
#### LDA_02 has weight of 0.008578510506548004
#### LDA_03 has weight of -0.039078402223844
#### LDA_04 has weight of 0.03929671710866691
#### global_subjectivity has weight of 0.0267148363935909
#### global_sentiment_polarity has weight of -0.04036971725499696
#### global_rate_positive_words has weight of -0.0437937758694026
#### global_rate_negative_words has weight of 0.0284408146367553
#### rate_positive_words has weight of -0.010457974560344705
#### rate_negative_words has weight of 0.009531619078503514
#### avg_positive_polarity has weight of -0.031002443143928273
#### min_positive_polarity has weight of -0.003814799206646656
#### max_positive_polarity has weight of -8.004172743040603e-05
#### avg_negative_polarity has weight of -0.003912544983939632
#### min_negative_polarity has weight of 0.01803603316203229
#### max_negative_polarity has weight of 0.006662030072321132
#### title_subjectivity has weight of -0.02256924361898776
#### title_sentiment_polarity has weight of -0.04740533717369407
#### abs_title_subjectivity has weight of -23.79448560162115
#### abs_title_sentiment_polarity has weight of -0.1669441901736163
#### shares has weight of -0.12540709638301883
#### day_of_week has weight of 0.05538718894703649
#### news_category has weight of -0.028254025919727865
#### year has weight of 0.01619159265421823
#### month has weight of 0.02235037953280418
#### log_shares has weight of 0.026116347826609695
#### log_n_tokens_content has weight of 0.016996274710331376
#### log_num_hrefs has weight of 0.08836445968013068
#### log_num_self_hrefs has weight of -0.007762766267592073
#### log_num_imgs has weight of -0.005799789526101167
#### log_num_videos has weight of 0.1739530502727808
#### log_kw_max_min has weight of -0.22503909337583722


In [31]:
from sklearn.linear_model import SGDClassifier
# Set the model parameters
alpha = 0.0001
fit_intercept = True
l1_ratio = 0.0
learning_rate = 'optimal'
loss = 'hinge' # gives a linear SVM 
n_iter_no_change = 10000
# Ridge 
penalty = 'l2'
# Lasso 
# penalty = 'l1'

# Initialize the SVM model.
support_vector_machine_model_sgd = SGDClassifier(alpha=alpha,fit_intercept=fit_intercept, l1_ratio=l1_ratio, learning_rate=learning_rate, loss=loss, n_iter_no_change=n_iter_no_change, penalty=penalty)

# Train the support vector machine model on the training set using a linear kernel.
support_vector_machine_model_sgd.fit(X_train, y_train)
# Make predictions on the test data.
y_pred = support_vector_machine_model_sgd.predict(X_test)

# Get the weights from the trained model.
weights = support_vector_machine_model_sgd.coef_.T

# Print the weight of each variable.
for weight, variable_name in zip(weights, df.columns[:-1]):
    print(f'{variable_name} has weight of {weight[0]}')

url_name has weight of -0.22080652042033538
date has weight of -0.007595301676654267
timedelta has weight of -0.18988023109706784
n_tokens_title has weight of 0.06180432150684208
n_unique_tokens has weight of 0.002297777107482027
average_token_length has weight of -0.015564282053148871
num_keywords has weight of -0.01677362395921804
kw_min_min has weight of -0.03574050396518777
kw_avg_min has weight of 0.01284274194595419
kw_max_max has weight of -0.10172491579455416
kw_avg_max has weight of -0.008024281483641928
kw_min_avg has weight of 0.0017795427769386198
kw_max_avg has weight of 0.0052608863049452064
is_weekend has weight of -0.0022931096309335196
LDA_00 has weight of -0.02508134994649698
LDA_01 has weight of 0.013626503563109276
LDA_02 has weight of 0.008578510506548004
LDA_03 has weight of -0.039078402223844
LDA_04 has weight of 0.03929671710866691
global_subjectivity has weight of 0.0267148363935909
global_sentiment_polarity has weight of -0.04036971725499696
global_rate_positi



In [32]:
# Calculate the accuracy, precision, recall, and F1 score of the model on the test data.
support_vector_machine_accuracy_sgd = accuracy_score(y_test, y_pred)
support_vector_machine_precision_sgd = precision_score(y_test, y_pred, average='macro')
support_vector_machine_recall_sgd = recall_score(y_test, y_pred, average='macro')
support_vector_machine_f1_score_sgd= f1_score(y_test, y_pred, average='macro')
# support_vector_machine_roc_auc_score = roc_auc_score(y_test, y_pred, average='macro')

print('---------------------------------------------------------')

# Print the metrics.
print('Support vector machine using SGD accuracy:', support_vector_machine_accuracy_sgd)
print('Support vector machine using SGD precision:', support_vector_machine_precision_sgd)
print('Support vector machine using SGD recall:', support_vector_machine_recall_sgd)
print('Support vector machine using SGD F1 score:', support_vector_machine_f1_score_sgd)
# print('Support vector machine ROC AUC score:', support_vector_machine_roc_auc_score)

---------------------------------------------------------
Support vector machine using SGD accuracy: 0.8122083490982469
Support vector machine using SGD precision: 0.5037453755087606
Support vector machine using SGD recall: 0.4628526565447935
Support vector machine using SGD F1 score: 0.45440550615686714


### Logistic Regression using SGD ???? NEED TO Evaluate this section and update the information here???

### Tuning: 
### The model parameters are set below to have regularlization strength of 1.0, a linear kernal is used to train the SVM with the gamma set to 0.1 to avoid overfitting (a higher gamma more prone to overfitting as makes the decision boundary more complex). 

### Model Advantages - Efficiency: 
### The models convergence rate was ~36.7 seconds. This is fast indicating our model is not complex.  

### Model Advantages - Performance
### The logistic model performed extremely well with an accuracy score of 0.9860007567158532. This is quite good better than the logistic regression model. The model also had a high precision score of 0.9552859295192899, so not too many false positives. A logistic regression recall of 0.9381806456213155 and a Logistic regression F1 score of 0.9463855789589289 are also good metrics. Overall the model performed very well in accurately classifying which articles were popular or not. 
Logistic regression using SGD accuracy: 0.8854836675495018
Logistic regression using SGD precision: 0.6068186224215949
Logistic regression using SGD recall: 0.51684574036108
Logistic regression using SGD F1 score: 0.4931996188232587

### Interpret Feature Importance - Weighted Coefficients Explanation:
### Just as in the previous logistic regression model, the SVM weighted coefficients indicate that there are number of features with slight either negative or positive correlations with the response variable, however absolute subjectivity level stands out with a negative correlation with the response variable. This means that the more articles with higher subjective title rating are less likely to be popular. The weight for absolute subjectivity level is much larger than the other weighted coefficients which is another indication of how strong the correlation with the response variable is compared to that of other features.

### Weights: 
#### average_token_length has weight of 0.018883633294519746
#### LDA_00 has weight of -0.015301510838099166
#### global_subjectivity has weight of 0.017013692317738614
#### title_subjectivity has weight of -0.031227404893417184
#### abs_title_subjectivity has weight of -448.03103999522546
#### day_of_week has weight of 0.04159595106887105
#### log_num_imgs has weight of -0.06858863999471022


In [33]:
from sklearn.inspection import permutation_importance
from sklearn.linear_model import SGDClassifier
alpha = .0001
fit_intercept = True
l1_ratio = 0.0
learning_rate = 'optimal'
loss = 'log' #use log for logistic regression 
n_iter_no_change = 500
# lasso 
penalty = 'l1'

# Initialize the SVM model.
support_vector_machine_model__log_sgd = SGDClassifier(alpha=alpha,fit_intercept=fit_intercept, l1_ratio=l1_ratio, learning_rate=learning_rate, loss=loss, n_iter_no_change=n_iter_no_change, penalty=penalty)

# Train the support vector machine model on the training set using a linear kernel.
support_vector_machine_model__log_sgd.fit(X_train, y_train)
# Make predictions on the test data.
y_pred = support_vector_machine_model__log_sgd.predict(X_test)

# Get the weights from the trained model.
weights = support_vector_machine_model__log_sgd.coef_.T

# Print the weight of each variable.
for weight, variable_name in zip(weights, df.columns[:-1]):
    print(f'{variable_name} has weight of {weight[0]}')

url_name has weight of 0.0
date has weight of 0.0
timedelta has weight of 0.0
n_tokens_title has weight of 0.0
n_unique_tokens has weight of 0.0
average_token_length has weight of 0.018883633294519746
num_keywords has weight of 0.0
kw_min_min has weight of 0.0
kw_avg_min has weight of 0.0
kw_max_max has weight of 0.0
kw_avg_max has weight of 0.0
kw_min_avg has weight of 0.0
kw_max_avg has weight of 0.0
is_weekend has weight of 0.0
LDA_00 has weight of -0.015301510838099166
LDA_01 has weight of 0.0
LDA_02 has weight of 0.0
LDA_03 has weight of 0.0
LDA_04 has weight of 0.0
global_subjectivity has weight of 0.017013692317738614
global_sentiment_polarity has weight of 0.0
global_rate_positive_words has weight of 0.0
global_rate_negative_words has weight of 0.0
rate_positive_words has weight of 0.0
rate_negative_words has weight of 0.0
avg_positive_polarity has weight of 0.0
min_positive_polarity has weight of 0.0
max_positive_polarity has weight of 0.0
avg_negative_polarity has weight of 0

In [34]:
logistic_regression_accuracy_sgd = accuracy_score(y_test, y_pred)
logistic_regression_precision_sgd = precision_score(y_test, y_pred, average='macro')
logistic_regression_recall_sgd = recall_score(y_test, y_pred, average='macro')
logistic_regression_f1_score_sgd = f1_score(y_test, y_pred, average='macro')
# logistic_regression_roc_auc_score = roc_curve(y_test, y_pred[:, 1])


# Print the metrics.
print('Logistic regression using SGD accuracy:', logistic_regression_accuracy_sgd)
print('Logistic regression using SGD precision:', logistic_regression_precision_sgd)
print('Logistic regression using SGD recall:', logistic_regression_recall_sgd)
print('Logistic regression using SGD F1 score:', logistic_regression_f1_score_sgd)



Logistic regression using SGD accuracy: 0.8854836675495018
Logistic regression using SGD precision: 0.6068186224215949
Logistic regression using SGD recall: 0.51684574036108
Logistic regression using SGD F1 score: 0.4931996188232587


  _warn_prf(average, modifier, msg_start, len(result))


## SUBSETTING -- We need to do this section 

If you used stochastic gradient descent (and therefore did not explicitly solve for support vectors), try subsampling your data to train the SVC model— then analyze the support vectors from the subsampled dataset.

### Advantages/Disadvantages: 
### Does one model type offer superior performance in terms of prediction
### In terms of training time or efficiency? In terms of training and efficiency the logistic regression was faster to train and had better results than the SVM model. Overall the acurracy... 
##### Accuracy: The proportion of predictions that are correct.

##### Precision: The proportion of positive predictions that are correct.

##### Recall: The proportion of positive examples that are correctly identified.

##### ROC:

#### Discuss the advantages of each model for each classification task. Does one type of model offer superior performance over another in terms of prediction accuracy? In terms of training time or efficiency? Explain in detail.


### Use the weights from logistic regression to interpret the importance of different features for each classification task. Explain your interpretation in detail. Why do you think some variables are more important?


we have this info above 


### Look at the chosen support vectors for the classification task. Do these provide any insight into the data? Explain.

### Conclusion 