In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

In [2]:
# Define Classes and Functions

# # Custom Logistic Regression
# class LogisticRegressionModel:
#     def __init__(self, C=0.05, penalty='l1', solver='saga'):
#         self.model = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=10000)

#     def train(self, X, y, epochs=1000):
#         self.model.fit(X, y)

#     def predict(self, X):
#         return self.model.predict(X)

#     def score(self, X, y):
#         # Calculate the accuracy of the model on the given data set.
#         accuracy = np.mean(self.predict(X) == y)

#         return accuracy

# # Custom Support Vector Machine
# class SupportVectorMachineModel:
#     def __init__(self, C=1.0):
#         self.model = SVC(C=C, kernel='linear')

#     def train(self, X_train, y_train):
#         self.model.fit(X_train, y_train)

#     def predict(self, X_test):
#         return self.model.predict(X_test)

# # Custom Evaluation model
# def evaluate_model(model, X_test, y_test):
#     y_pred = model.predict(X_test)

#     accuracy = np.mean(y_pred == y_test)
#     precision = np.mean(y_pred[y_test == 1] == 1)
#     recall = np.mean(y_pred == y_test) if np.sum(y_test == 1) > 0 else 0
#     # F1 score: A harmonic mean of precision and recall.
#     f1_score = 2 * (precision * recall) / (precision + recall)

#     return accuracy, precision, recall, f1_score

# # Custom Stochastic Gradient Descent
# class StochasticGradientDescent:
#     def __init__(self, learning_rate=0.01):
#         self.learning_rate = learning_rate
#         self.weights = None
        
#     # Initialize the model parameters to random values.
#     def train(self, X, y, epochs=1000):
#         # Initialize the weights
#         self.weights = np.random.randn(X.shape[1])

#         # Iterate over the training data in batches
#         for epoch in range(epochs):
#             for i in range(X.shape[0]):
#                 # Get a batch of data
#                 x_batch = X[i:i + 1, :]
#                 y_batch = y[i:i + 1]

#                 # Compute the predictions
#                 y_pred = self.predict(x_batch)

#                 # Compute the gradients of the loss function with respect to the model parameters
#                 gradients = np.dot(x_batch.T, y_pred - y_batch)

#                 # Update the model parameters using the gradients and a learning rate
#                 self.weights -= self.learning_rate * gradients

#     def predict(self, X):
#         # Compute the predictions
#         y_pred = 1 / (1 + np.exp(-np.dot(X, self.weights)))

#         return y_pred
    
#     def compute_gradients(self, X_train, y_train):
#         # Compute the predictions
#         y_pred = self.predict(X_train)

#         # Compute the gradients of the loss function with respect to the model parameters
#         gradients = np.dot(X_train.T, y_pred - y_train)

#         return gradients

## Data Preparation

In [3]:
# file path
filepath = "../1 - Visualization and Data Preprocessing/Data/ONPClean2.csv"
# Load the dataset
df = pd.read_csv(filepath)

df.head()

Unnamed: 0,url_name,date,timedelta,n_tokens_title,n_unique_tokens,average_token_length,num_keywords,kw_min_min,kw_avg_min,kw_max_max,...,log_num_hrefs,log_num_self_hrefs,log_num_imgs,log_num_videos,log_kw_max_min,log_kw_min_max,log_kw_avg_avg,log_self_reference_min_shares,log_self_reference_max_shares,log_self_reference_avg_sharess
0,amazon-instant-video-browser/,2013-01-07,731.0,12.0,0.663594,4.680365,5.0,0.0,0.0,0.0,...,1.609438,1.098612,0.693147,0.0,0.0,0.0,0.0,6.20859,6.20859,6.20859
1,reeddit-reddit/,2013-01-07,731.0,8.0,0.821705,4.546154,9.0,0.0,0.0,0.0,...,2.079442,1.609438,0.0,0.0,0.0,0.0,0.0,7.170888,7.170888,7.170888
2,rage-comics-dying/,2013-01-07,731.0,9.0,0.608602,4.759494,7.0,0.0,0.0,0.0,...,2.484907,0.0,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,power-matters-alliance-organization/,2013-01-07,731.0,10.0,0.53539,5.147748,10.0,0.0,0.0,0.0,...,2.079442,1.94591,0.693147,0.0,0.0,0.0,0.0,7.550135,7.550135,7.550135
4,polaroid-android-camera/,2013-01-07,731.0,9.0,0.424132,4.63139,8.0,0.0,0.0,0.0,...,3.091042,3.091042,3.044522,0.0,0.0,0.0,0.0,6.302619,9.680406,8.140199


In [4]:
# drop certain columns
df1 = df.drop('url_name', axis=1) # was a string
df1 = df1.drop('date', axis=1) # datetime change didnt even work.
df1 = df1.drop('day_of_week', axis=1) # other categorical variable
df1 = df1.drop('news_category', axis=1) # other categorical variable

df1.head()

Unnamed: 0,timedelta,n_tokens_title,n_unique_tokens,average_token_length,num_keywords,kw_min_min,kw_avg_min,kw_max_max,kw_avg_max,kw_min_avg,...,log_num_hrefs,log_num_self_hrefs,log_num_imgs,log_num_videos,log_kw_max_min,log_kw_min_max,log_kw_avg_avg,log_self_reference_min_shares,log_self_reference_max_shares,log_self_reference_avg_sharess
0,731.0,12.0,0.663594,4.680365,5.0,0.0,0.0,0.0,0.0,0.0,...,1.609438,1.098612,0.693147,0.0,0.0,0.0,0.0,6.20859,6.20859,6.20859
1,731.0,8.0,0.821705,4.546154,9.0,0.0,0.0,0.0,0.0,0.0,...,2.079442,1.609438,0.0,0.0,0.0,0.0,0.0,7.170888,7.170888,7.170888
2,731.0,9.0,0.608602,4.759494,7.0,0.0,0.0,0.0,0.0,0.0,...,2.484907,0.0,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,731.0,10.0,0.53539,5.147748,10.0,0.0,0.0,0.0,0.0,0.0,...,2.079442,1.94591,0.693147,0.0,0.0,0.0,0.0,7.550135,7.550135,7.550135
4,731.0,9.0,0.424132,4.63139,8.0,0.0,0.0,0.0,0.0,0.0,...,3.091042,3.091042,3.044522,0.0,0.0,0.0,0.0,6.302619,9.680406,8.140199


In [5]:
# Describe the `shares` column
df1['shares'].describe()

count     39644.000000
mean       3395.380184
std       11626.950749
min           1.000000
25%         946.000000
50%        1400.000000
75%        2800.000000
max      843300.000000
Name: shares, dtype: float64

In [6]:
# Create a new column called `share_ranges` with categorical levels
df1['share_ranges'] = pd.cut(df1['shares'], bins=[0, 2500, 5000, 7500, 10000, 20000, 100000, 1000000], labels=['<2500', '>2500 & <5000', '>5000 & <7500', '>7500 & <10000', '>10000 & <20000', '>20000 & <100000', '>100000'])

# Print the DataFrame
df1.head()

Unnamed: 0,timedelta,n_tokens_title,n_unique_tokens,average_token_length,num_keywords,kw_min_min,kw_avg_min,kw_max_max,kw_avg_max,kw_min_avg,...,log_num_self_hrefs,log_num_imgs,log_num_videos,log_kw_max_min,log_kw_min_max,log_kw_avg_avg,log_self_reference_min_shares,log_self_reference_max_shares,log_self_reference_avg_sharess,share_ranges
0,731.0,12.0,0.663594,4.680365,5.0,0.0,0.0,0.0,0.0,0.0,...,1.098612,0.693147,0.0,0.0,0.0,0.0,6.20859,6.20859,6.20859,<2500
1,731.0,8.0,0.821705,4.546154,9.0,0.0,0.0,0.0,0.0,0.0,...,1.609438,0.0,0.0,0.0,0.0,0.0,7.170888,7.170888,7.170888,<2500
2,731.0,9.0,0.608602,4.759494,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,<2500
3,731.0,10.0,0.53539,5.147748,10.0,0.0,0.0,0.0,0.0,0.0,...,1.94591,0.693147,0.0,0.0,0.0,0.0,7.550135,7.550135,7.550135,<2500
4,731.0,9.0,0.424132,4.63139,8.0,0.0,0.0,0.0,0.0,0.0,...,3.091042,3.044522,0.0,0.0,0.0,0.0,6.302619,9.680406,8.140199,<2500


In [7]:
# Describe the `shares` column
# df1['share_ranges'].value_counts()

In [8]:
df1.dropna()

Unnamed: 0,timedelta,n_tokens_title,n_unique_tokens,average_token_length,num_keywords,kw_min_min,kw_avg_min,kw_max_max,kw_avg_max,kw_min_avg,...,log_num_self_hrefs,log_num_imgs,log_num_videos,log_kw_max_min,log_kw_min_max,log_kw_avg_avg,log_self_reference_min_shares,log_self_reference_max_shares,log_self_reference_avg_sharess,share_ranges
0,731.0,12.0,0.663594,4.680365,5.0,0.0,0.000000,0.0,0.000000,0.000000,...,1.098612,0.693147,0.000000,0.000000,0.000000,0.000000,6.208590,6.208590,6.208590,<2500
1,731.0,8.0,0.821705,4.546154,9.0,0.0,0.000000,0.0,0.000000,0.000000,...,1.609438,0.000000,0.000000,0.000000,0.000000,0.000000,7.170888,7.170888,7.170888,<2500
2,731.0,9.0,0.608602,4.759494,7.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.000000,0.693147,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,<2500
3,731.0,10.0,0.535390,5.147748,10.0,0.0,0.000000,0.0,0.000000,0.000000,...,1.945910,0.693147,0.000000,0.000000,0.000000,0.000000,7.550135,7.550135,7.550135,<2500
4,731.0,9.0,0.424132,4.631390,8.0,0.0,0.000000,0.0,0.000000,0.000000,...,3.091042,3.044522,0.000000,0.000000,0.000000,0.000000,6.302619,9.680406,8.140199,<2500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39639,9.0,12.0,0.567227,4.313253,5.0,-1.0,42.600000,843300.0,571200.000000,2170.324903,...,1.098612,0.693147,0.693147,5.384495,9.994288,7.967156,8.071219,9.179984,8.771990,<2500
39640,9.0,13.0,0.570136,4.589286,10.0,-1.0,511.000000,843300.0,310130.000000,1500.000000,...,2.079442,0.693147,0.693147,8.131825,7.313887,7.946497,7.003974,7.550135,7.424165,<2500
39641,9.0,12.0,0.514925,4.263403,7.0,-1.0,525.000000,843300.0,224885.714286,1880.000000,...,1.386294,1.386294,0.000000,7.378384,8.366603,8.083845,7.170888,7.170888,7.170888,>2500 & <5000
39642,9.0,15.0,0.506261,5.005172,7.0,-1.0,88.857143,843300.0,266628.571429,1558.755814,...,1.098612,1.386294,0.000000,6.104793,9.752723,7.912769,7.244942,7.244942,7.244942,<2500


In [9]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39644 entries, 0 to 39643
Data columns (total 49 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   timedelta                       39644 non-null  float64 
 1   n_tokens_title                  39644 non-null  float64 
 2   n_unique_tokens                 39644 non-null  float64 
 3   average_token_length            39644 non-null  float64 
 4   num_keywords                    39644 non-null  float64 
 5   kw_min_min                      39644 non-null  float64 
 6   kw_avg_min                      39644 non-null  float64 
 7   kw_max_max                      39644 non-null  float64 
 8   kw_avg_max                      39644 non-null  float64 
 9   kw_min_avg                      39644 non-null  float64 
 10  kw_max_avg                      39644 non-null  float64 
 11  is_weekend                      39644 non-null  float64 
 12  LDA_00            

In [10]:
# Split the dataset into 80% training and 20% testing sets.
X = df1.drop('share_ranges', axis=1)
y = df1['share_ranges']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Scale the features in the training and testing sets.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
# Check for NaNs in the dataset
for column in df1.columns:
    if df1[column].isnull().any():
        print('NaNs found in column:', column)

### Logistic Regression Model

In [12]:
# Create the logistic regression model
# logistic_regression_model = LogisticRegressionModel()

# # Train the logistic regression model on the training set.
# logistic_regression_model.train(X_train, y_train)

# # Calculate the cross-validated accuracy of the model.
# logistic_regression_cross_validated_accuracy = cross_val_score(logistic_regression_model, X_train, y_train, cv=5)

# # Print the cross-validated accuracy of the model.
# print('Logistic regression cross-validated accuracy:', logistic_regression_cross_validated_accuracy.mean())

In [13]:
# Set the model parameters.
C = 0.05
penalty = 'l1'
solver = 'saga'
max_iter=10000

# Create a logistic regression model.
logistic_regression_model = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=max_iter)

# Fit the model to the training data.
logistic_regression_model.fit(X_train, y_train)

# Make predictions on the test data.
y_pred = logistic_regression_model.predict(X_test)

LogisticRegression(C=0.05, max_iter=10000, penalty='l1', solver='saga')

In [14]:
# # Do cross validation
# logistic_regression_cross_validated_accuracy = cross_val_score(logistic_regression_model, X_train, y_train, cv=5)

# # Check accuracy from cross validation
# print('Logistic regression cross-validated accuracy:', logistic_regression_cross_validated_accuracy.mean())

In [15]:
# # Plot the cross-validated accuracy.
# plt.plot(logistic_regression_cross_validated_accuracy)

# # Fit a line to the cross-validated accuracy.
# # line, axes = plt.plot(logistic_regression_cross_validated_accuracy, 'r-')

# # Set the title and labels of the plot.
# plt.title('Logistic Regression Cross-Validated Accuracy')
# plt.xlabel('Cross-Validation Chunk')
# plt.ylabel('Accuracy')

# # Add a label to the fit line.
# # line.set_label('C=0.05, penalty=\'l1\', solver=\'saga\'')

# # Show the plot.
# plt.legend()
# plt.show()

In [29]:
# Calculate the accuracy, precision, recall, and F1 score of the model on the test data.
logistic_regression_accuracy = accuracy_score(y_test, y_pred)
logistic_regression_precision = precision_score(y_test, y_pred, average='macro')
logistic_regression_recall = recall_score(y_test, y_pred, average='macro')
logistic_regression_f1_score = f1_score(y_test, y_pred, average='macro')
# logistic_regression_roc_auc_score = roc_curve(y_test, y_pred[:, 1])


# Print the metrics.
print('Logistic regression accuracy:', logistic_regression_accuracy)
print('Logistic regression precision:', logistic_regression_precision)
print('Logistic regression recall:', logistic_regression_recall)
print('Logistic regression F1 score:', logistic_regression_f1_score)
# print('Logisticregression ROC AUC score:', logistic_regression_roc_auc_score)

Logistic regression accuracy: 0.9713709168873754
Logistic regression precision: 0.9253480421239193
Logistic regression recall: 0.8188545675735109
Logistic regression F1 score: 0.8354224838309382


In [31]:
# # Convert the y_pred array to a 2-dimensional array.
# y_pred_2d = y_pred[:, 2]

# # Calculate the ROC curve.
# fpr, tpr, thresholds = roc_curve(y_test, y_pred_2d)

# # Plot the ROC curve.
# plt.plot(fpr, tpr)
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curve')
# plt.show()

### Support Vector Machine Model

In [32]:
# Create the support vector machine model
support_vector_machine_model = SVC()

# Train the support vector machine model on the training set using a linear kernel.
support_vector_machine_model.fit(X_train, y_train)

# Make predictions on the test data.
y_pred = support_vector_machine_model.predict(X_test)

SVC()

In [None]:
# # Do cross validation
# support_vector_machine_cross_validated_accuracy = cross_val_score(support_vector_machine_model, X_train, y_train, cv=5)

# # Check accuracy from cross validation
# print('SVM cross-validated accuracy:', support_vector_machine_cross_validated_accuracy.mean())

In [None]:
# # Plot the cross-validated accuracy.
# plt.plot(support_vector_machine_cross_validated_accuracy)

# # Fit a line to the cross-validated accuracy.
# # line, axes = plt.plot(logistic_regression_cross_validated_accuracy, 'r-')

# # Set the title and labels of the plot.
# plt.title('SVM Cross-Validated Accuracy')
# plt.xlabel('Cross-Validation Chunk')
# plt.ylabel('Accuracy')

# # Add a label to the fit line.
# # line.set_label('C=0.05, penalty=\'l1\', solver=\'saga\'')

# # Show the plot.
# plt.legend()
# plt.show()

In [34]:
# Calculate the accuracy, precision, recall, and F1 score of the model on the test data.
support_vector_machine_accuracy = accuracy_score(y_test, y_pred)
support_vector_machine_precision = precision_score(y_test, y_pred, average='macro')
support_vector_machine_recall = recall_score(y_test, y_pred, average='macro')
support_vector_machine_f1_score = f1_score(y_test, y_pred, average='macro')
# support_vector_machine_roc_auc_score = roc_auc_score(y_test, y_pred, average='macro')

# Print the metrics.
print('Support vector machine accuracy:', support_vector_machine_accuracy)
print('Support vector machine precision:', support_vector_machine_precision)
print('Support vector machine recall:', support_vector_machine_recall)
print('Support vector machine F1 score:', support_vector_machine_f1_score)
# print('Support vector machine ROC AUC score:', support_vector_machine_roc_auc_score)

Support vector machine accuracy: 0.9509395888510531
Support vector machine precision: 0.8921229820194464
Support vector machine recall: 0.8176104771864495
Support vector machine F1 score: 0.849605018971621


### Adjusting parameters

To adjust the parameters of the logistic regression and support vector machine models, I would use a grid search approach. This involves training the models with a range of different parameter values and evaluating their performance on the testing set. 
The parameter values that produce the best performance on the testing set would then be selected as the final parameters for the model.

### Stochastic Gradient Descent

In [None]:
# # Create the logistic regression model
# SGD = StochasticGradientDescent()

# # Train the model
# SGD.train(X, y)

# # Make predictions
# y_pred = SGD.predict(X)

# # Print the predictions
# print(y_pred)

In [35]:
from sklearn.linear_model import SGDClassifier

regularize_const = 0.1
iterations = 5
svm_sgd = SGDClassifier(alpha=regularize_const,
        fit_intercept=True, l1_ratio=0.0, learning_rate='optimal',
        loss='hinge', n_iter_no_change=iterations, n_jobs=-1, penalty='l2')

scl = StandardScaler()
for train_idx, test_idx in cv.split(X,y):
    svm_sgd.fit(scl.fit_transform(X[train_idx]),y[train_idx])
    yhat = svm_sgd.predict(scl.transform(X[test_idx]))

    conf = mt.confusion_matrix(y[test_idx],yhat)
    acc = mt.accuracy_score(y[test_idx],yhat)

print('SVM:', acc)

NameError: name 'cv' is not defined

### Evaluating model performance

##### Accuracy: The proportion of predictions that are correct.

##### Precision: The proportion of positive predictions that are correct.

##### Recall: The proportion of positive examples that are correctly identified.

##### ROC:

#### Discuss the advantages of each model for each classification task. Does one type of model offer superior performance over another in terms of prediction accuracy? In terms of training time or efficiency? Explain in detail.

The Logistic Regression model takes longer to train than the SVM model
Logistic Regression showed better numbers across each of the metrics recorded. In some cases this difference is very small.
Accuracy was beter by ~0.02, precision by ~0.03, recall by ~0.001, and an F1 score difference of ~0.015.

#### Use the weights from logistic regression to interpret the importance of different
features for each classification task. Explain your interpretation in detail. Why do you think
some variables are more important?

#### Look at the chosen support vectors for the classification task. Do these provide
any insight into the data? Explain.

##### From Google Bard:

Which model to use?

The best model to use for a particular classification task will depend on the specific characteristics of the data. Logistic regression is a good choice for tasks where the data is linearly separable and the dataset size is not too large. Support vector machines are a good choice for tasks where the data is not linearly separable, the dataset size is large, or there are outliers in the data.

It is also important to note that support vector machines can be more computationally expensive to train than logistic regression models.

Conclusion

To create a logistic regression model and a support vector machine model for the classification task involved with my dataset, I would follow the steps outlined above. I would then adjust the parameters of the models to improve their performance and evaluate their performance on the testing set. The model with the best performance on the testing set would then be selected as the final model.
#####################################################################