In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [24]:
# Define Classes and Functions

# Logistic Regression
class LogisticRegressionModel:
    def __init__(self, C=1.0):
        self.model = LogisticRegression(C=C)

    def train(self, X_train, y_train):
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        return self.model.predict(X_test)

# Support Vector Machine
class SupportVectorMachineModel:
    def __init__(self, C=1.0):
        self.model = SVC(C=C, kernel='linear')

    def train(self, X_train, y_train):
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        return self.model.predict(X_test)

# Evaluation
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)

    accuracy = np.mean(y_pred == y_test)
    precision = np.mean(y_pred[y_test == 1] == 1)
    recall = np.mean(y_pred == y_test) if np.sum(y_test == 1) > 0 else 0
    # F1 score: A harmonic mean of precision and recall.
    f1_score = 2 * (precision * recall) / (precision + recall)

    return accuracy, precision, recall, f1_score

# Stochastic Gradient Descent
class StochasticGradientDescent:
    def __init__(self, learning_rate=0.01):
        self.learning_rate = learning_rate
        self.weights = None

    def train(self, X, y, epochs=1000):
        # Initialize the weights
        self.weights = np.random.randn(X.shape[1])

        # Iterate over the training data in batches
        for epoch in range(epochs):
            for i in range(X.shape[0]):
                # Get a batch of data
                x_batch = X[i:i + 1, :]
                y_batch = y[i:i + 1]

                # Compute the predictions
                y_pred = self.predict(x_batch)

                # Compute the gradients of the loss function with respect to the model parameters
                gradients = np.dot(x_batch.T, y_pred - y_batch)

                # Update the model parameters using the gradients and a learning rate
                self.weights -= self.learning_rate * gradients

    def predict(self, X):
        # Compute the predictions
        y_pred = 1 / (1 + np.exp(-np.dot(X, self.weights)))

        return y_pred

In [11]:
# file path
filepath = "../1 - Visualization and Data Preprocessing/Data/ONPClean2.csv"
# Load the dataset
df = pd.read_csv(filepath)

df.head()

Unnamed: 0,url_name,date,timedelta,n_tokens_title,n_unique_tokens,average_token_length,num_keywords,kw_min_min,kw_avg_min,kw_max_max,...,log_num_hrefs,log_num_self_hrefs,log_num_imgs,log_num_videos,log_kw_max_min,log_kw_min_max,log_kw_avg_avg,log_self_reference_min_shares,log_self_reference_max_shares,log_self_reference_avg_sharess
0,amazon-instant-video-browser/,2013-01-07,731.0,12.0,0.663594,4.680365,5.0,0.0,0.0,0.0,...,1.609438,1.098612,0.693147,0.0,0.0,0.0,0.0,6.20859,6.20859,6.20859
1,reeddit-reddit/,2013-01-07,731.0,8.0,0.821705,4.546154,9.0,0.0,0.0,0.0,...,2.079442,1.609438,0.0,0.0,0.0,0.0,0.0,7.170888,7.170888,7.170888
2,rage-comics-dying/,2013-01-07,731.0,9.0,0.608602,4.759494,7.0,0.0,0.0,0.0,...,2.484907,0.0,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,power-matters-alliance-organization/,2013-01-07,731.0,10.0,0.53539,5.147748,10.0,0.0,0.0,0.0,...,2.079442,1.94591,0.693147,0.0,0.0,0.0,0.0,7.550135,7.550135,7.550135
4,polaroid-android-camera/,2013-01-07,731.0,9.0,0.424132,4.63139,8.0,0.0,0.0,0.0,...,3.091042,3.091042,3.044522,0.0,0.0,0.0,0.0,6.302619,9.680406,8.140199


In [20]:
# drop certain column
df1 = df.drop('url_name', axis=1) # was a string
df1 = df1.drop('date', axis=1) # datetime change didnt even work.
df1 = df1.drop('day_of_week', axis=1) # other categorical variable


# Change the data type of the column containing the date value to datetime
# df1['date'] = pd.to_datetime(df1['date']) # datetime data type doesnt work

df1.head()

Unnamed: 0,timedelta,n_tokens_title,n_unique_tokens,average_token_length,num_keywords,kw_min_min,kw_avg_min,kw_max_max,kw_avg_max,kw_min_avg,...,log_num_hrefs,log_num_self_hrefs,log_num_imgs,log_num_videos,log_kw_max_min,log_kw_min_max,log_kw_avg_avg,log_self_reference_min_shares,log_self_reference_max_shares,log_self_reference_avg_sharess
0,731.0,12.0,0.663594,4.680365,5.0,0.0,0.0,0.0,0.0,0.0,...,1.609438,1.098612,0.693147,0.0,0.0,0.0,0.0,6.20859,6.20859,6.20859
1,731.0,8.0,0.821705,4.546154,9.0,0.0,0.0,0.0,0.0,0.0,...,2.079442,1.609438,0.0,0.0,0.0,0.0,0.0,7.170888,7.170888,7.170888
2,731.0,9.0,0.608602,4.759494,7.0,0.0,0.0,0.0,0.0,0.0,...,2.484907,0.0,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,731.0,10.0,0.53539,5.147748,10.0,0.0,0.0,0.0,0.0,0.0,...,2.079442,1.94591,0.693147,0.0,0.0,0.0,0.0,7.550135,7.550135,7.550135
4,731.0,9.0,0.424132,4.63139,8.0,0.0,0.0,0.0,0.0,0.0,...,3.091042,3.091042,3.044522,0.0,0.0,0.0,0.0,6.302619,9.680406,8.140199


In [21]:
# Split the dataset into 80% training and 20% testing sets.
X = df1.drop('news_category', axis=1)
y = df1['news_category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Scale the features in the training and testing sets.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Logistic Regression Model

In [25]:
# Create the logistic regression model
logistic_regression_model = LogisticRegressionModel()

# Train the logistic regression model on the training set.
logistic_regression_model.train(X_train, y_train)

# Evaluate the model on the testing set to assess its performance.
logistic_regression_accuracy, logistic_regression_precision, logistic_regression_recall, logistic_regression_f1_score = evaluate_model(logistic_regression_model, X_test, y_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [22]:
# Adjust the parameters of the model, such as the regularization parameter, to improve its performance.

# Create the logistic regression model with a regularization parameter of 0.5
# logistic_regression_model = LogisticRegressionModel(C=0.5)

# # Train the model on the training set
# logistic_regression_model.train(X_train, y_train)

# # Evaluate the model on the testing set
# logistic_regression_accuracy, logistic_regression_precision, logistic_regression_recall, logistic_regression_f1_score = evaluate_model(logistic_regression_model, X_test, y_test)

# # Print the evaluation results
# print('Logistic regression:')
# print('Accuracy:', logistic_regression_accuracy)
# print('Precision:', logistic_regression_precision)
# print('Recall:', logistic_regression_recall)
# print('F1 score:', logistic_regression_f1_score)

SyntaxError: invalid syntax (959081433.py, line 5)

### Support Vector Machine Model

In [29]:
# Create the support vector machine model
support_vector_machine_model = SupportVectorMachineModel()

# Train the support vector machine model on the training set using a linear kernel.
support_vector_machine_model.train(X_train, y_train)

# Evaluate the model on the testing set to assess its performance.
support_vector_machine_accuracy, support_vector_machine_precision, support_vector_machine_recall, support_vector_machine_f1_score = evaluate_model(support_vector_machine_model, X_test, y_test)

# Adjust the parameters of the model, such as the regularization parameter, to improve its performance.


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


### Adjusting parameters

To adjust the parameters of the logistic regression and support vector machine models, I would use a grid search approach. This involves training the models with a range of different parameter values and evaluating their performance on the testing set. 
The parameter values that produce the best performance on the testing set would then be selected as the final parameters for the model.

### Stochastic Gradient Descent

In [38]:
# Stochastic Gradient Descent
class StochasticGradientDescent:
    def __init__(self, learning_rate=0.01):
        self.learning_rate = learning_rate
        self.weights = None
        
    # Initialize the model parameters to random values.
    def train(self, X, y, epochs=1000):
        # Initialize the weights
        self.weights = np.random.randn(X.shape[1])

        # Iterate over the training data in batches
        for epoch in range(epochs):
            for i in range(X.shape[0]):
                # Get a batch of data
                x_batch = X[i:i + 1, :]
                y_batch = y[i:i + 1]

                # Compute the predictions
                y_pred = self.predict(x_batch)

                # Compute the gradients of the loss function with respect to the model parameters
                gradients = np.dot(x_batch.T, y_pred - y_batch)

                # Update the model parameters using the gradients and a learning rate
                self.weights -= self.learning_rate * gradients

    def predict(self, X):
        # Compute the predictions
        y_pred = 1 / (1 + np.exp(-np.dot(X, self.weights)))

        return y_pred
    
#     def compute_gradients(self, X_train, y_train):
#         # Compute the predictions
#         y_pred = self.predict(X_train)

#         # Compute the gradients of the loss function with respect to the model parameters
#         gradients = np.dot(X_train.T, y_pred - y_train)

#         return gradients


In [39]:
# Create the logistic regression model
SGD = StochasticGradientDescent()

# Train the model
SGD.train(X, y)

# Make predictions
y_pred = SGD.predict(X)

# Print the predictions
print(y_pred)

TypeError: '(slice(0, 1, None), slice(None, None, None))' is an invalid key

### Evaluating model performance

##### Accuracy: The proportion of predictions that are correct.

##### Precision: The proportion of positive predictions that are correct.

##### Recall: The proportion of positive examples that are correctly identified.

##### ROC:

In [35]:
# Print the evaluation results
print('Logistic regression:')
print('Accuracy:', logistic_regression_accuracy)
print('Precision:', logistic_regression_precision)
print('Recall:', logistic_regression_recall)
print('F1 score:', logistic_regression_f1_score)
# print('ROC:', logistic_regression_roc)

print('')

print('Support vector machine:')
print('Accuracy:', support_vector_machine_accuracy)
print('Precision:', support_vector_machine_precision)
print('Recall:', support_vector_machine_recall)
print('F1 score:', support_vector_machine_f1_score)
# print('ROC:', support_vector_machine_roc)

Logistic regression:
Accuracy: 0.7868583680161433
Precision: nan
Recall: 0
F1 score: nan

Support vector machine:
Accuracy: 0.7956867196367764
Precision: nan
Recall: 0
F1 score: nan


From Google Bard:

Which model to use?

The best model to use for a particular classification task will depend on the specific characteristics of the data. Logistic regression is a good choice for tasks where the data is linearly separable and the dataset size is not too large. Support vector machines are a good choice for tasks where the data is not linearly separable, the dataset size is large, or there are outliers in the data.

It is also important to note that support vector machines can be more computationally expensive to train than logistic regression models.

Conclusion

To create a logistic regression model and a support vector machine model for the classification task involved with my dataset, I would follow the steps outlined above. I would then adjust the parameters of the models to improve their performance and evaluate their performance on the testing set. The model with the best performance on the testing set would then be selected as the final model.