In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

In [None]:
# Load the training and test CSVs from the local `dataset/` folder.
df_train = pd.read_csv("dataset/train.csv")
df_test = pd.read_csv("dataset/test.csv")

# Report the shape and in-memory footprint of each frame (bytes / 1024**2 -> MB).
print(f'Training Set Shape = {df_train.shape}')
print(f'Training Set Memory Usage = {df_train.memory_usage().sum() / 1024**2:.2f} MB')
print(f'Test Set Shape = {df_test.shape}')
print(f'Test Set Memory Usage = {df_test.memory_usage().sum() / 1024**2:.2f} MB')

In [None]:
# Summary statistics of the numeric columns: training frame first, then test frame.
display(df_train.describe(), df_test.describe())

In [None]:
# First rows of each frame: training frame first, then test frame.
display(df_train.head(), df_test.head())

In [None]:
# Eyeball the `text` column of five randomly drawn training rows.
df_train.sample(n=5)["text"]

In [None]:
# Training configuration constants.
BATCH_SIZE = 32
NUM_TRAINING_EXAMPLES = df_train.shape[0]
TRAIN_SPLIT = 0.8
VAL_SPLIT = 0.2
# Number of whole batches in the training portion of the data.
# The int() wraps the full product so the result is an int; the previous form
# (`int(N)*TRAIN_SPLIT // BATCH_SIZE`) floor-divided a float and produced a
# float such as 190.0 instead of 190.
STEPS_PER_EPOCH = int(NUM_TRAINING_EXAMPLES * TRAIN_SPLIT) // BATCH_SIZE

EPOCHS = 2
# Start with non-tensorflow approaches
# AUTO = tf.data.experimental.AUTOTUNE 

In [None]:
# Train / validation split of the labelled data, plus the held-out test text.
# Features come from the `text` column, labels from the `target` column.
X, y = df_train["text"], df_train["target"]

# If 42 is the answer, then 42*42 is the seed
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=VAL_SPLIT,
    random_state=42 * 42,
)

# Test data unknown: only the text column is taken from the test frame.
X_test = df_test["text"]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

# Vectorize the training dataset text with TF-IDF
# (if it's fun to say, you know it's good: "tiffeediff").
# The vectorizer is fitted on JUST the training split rather than the full
# dataset, so the validation text does not influence the fitted vectorizer.
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', norm='l2')
X_train_vectorized = vectorizer.fit_transform(X_train)

# Reduce the TF-IDF features to 4 principal components.
# `.toarray()` hands PCA a dense array.
# random_state is pinned (same seed as the train/validation split) because
# PCA's default solver selection may use a randomized SVD, in which case the
# components would otherwise differ from one kernel run to the next.
pca = PCA(n_components=4, random_state=42*42)
X_train_pca = pca.fit_transform(X_train_vectorized.toarray())

# Print the explained variance ratio of each retained component
print("Explained Variance Ratio:", pca.explained_variance_ratio_)


In [None]:
# Total fraction of variance captured by all retained components together.
np.sum(pca.explained_variance_ratio_)

In [None]:
# Plot the PCA components in a seaborn pairplot
import seaborn as sns

# Create a dataframe from the PCA components (one row per training example,
# in the same order as X_train / y_train).
df_pca = pd.DataFrame(X_train_pca, columns=['PCA1', 'PCA2', 'PCA3', 'PCA4'])

# Attach labels POSITIONALLY. df_pca has a fresh 0..n-1 index, while y_train
# still carries the shuffled row labels produced by train_test_split.
# Assigning df_train['target'] here would align on index and colour each point
# with the label of an unrelated row.
df_pca['target'] = y_train.to_numpy()

# Plot the pairplot, coloured by class; the trailing `;` hides the object repr
sns.pairplot(df_pca, hue='target');

In [None]:
# Find the words that are most correlated with the target variable

# Create a dataframe from the vectorized text: one row per training example
# (same order as X_train / y_train), one column per vocabulary term.
df_vectorized = pd.DataFrame(X_train_vectorized.toarray(), columns=vectorizer.get_feature_names_out())

# Work on a reproducible sample of at most 5000 rows. The size is capped at the
# number of available rows so the cell also runs on smaller training splits,
# and the seed matches the one used for the train/validation split.
df_vec_sample = df_vectorized.sample(n=min(5000, len(df_vectorized)), random_state=42*42)

# Correlate every term column with the label.
# `corrwith` aligns on index. df_vectorized is indexed 0..n-1 by position, so
# y_train is re-indexed the same way; using df_train['target'] directly would
# pair each row with the label of a different example.
correlation_series = df_vec_sample.corrwith(y_train.reset_index(drop=True))

In [None]:
# Show the 25 terms with the highest correlation to the target,
# ordered from strongest to weakest.
correlation_series.sort_values(ascending=False).head(25)

In [None]:
# Vectorize and Dimensionality Reduce the Validation Data
# Only `transform` is called here (never `fit`), so the validation text is
# encoded with the vectorizer and PCA exactly as they were fitted earlier.
X_val_vectorized = vectorizer.transform(X_val)
# `.toarray()` converts to a dense array before the PCA projection, mirroring
# how the training features were passed to PCA.
X_val_pca = pca.transform(X_val_vectorized.toarray())

In [None]:
from sklearn.model_selection import GridSearchCV

# Baseline: train a basic SVC model with default hyperparameters and predict
# on the PCA-reduced validation features.
svc = SVC()
svc.fit(X_train_pca, y_train)
y_pred = svc.predict(X_val_pca)

# For a fancy SVC model, tune the hyperparameters of the SVC model with a
# cross-validated grid search over C and gamma on the training split.
param_grid = {
    'C': np.logspace(-2, 2, 5),  # 0.01 to 100
    'gamma': np.logspace(-4, 1, 6),  # 0.0001 to 10
    # 'class_weight': [None, 'balanced']    # Uncomment if the dataset seems imbalanced (lots of 0s or 1s)
}

grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=1)
grid.fit(X_train_pca, y_train)
print("Best Parameters:", grid.best_params_)

# With refit=True the search has already retrained an SVC with the best
# hyperparameters on the whole of X_train_pca, so that fitted estimator is
# reused rather than building and fitting SVC(**grid.best_params_) again.
svc_best = grid.best_estimator_
y_pred_best = svc_best.predict(X_val_pca)

# Print both SVC accuracies on the validation split
print("Old SVC Accuracy:", accuracy_score(y_val, y_pred))
print("New SVC Accuracy:", accuracy_score(y_val, y_pred_best))

# Author's note: the tuned model came out roughly 1% more accurate on the
# validation split. Whether it also generalizes better to the test set is not
# verified in this notebook.
