<a href="https://colab.research.google.com/github/KelseyWalking/ML_road/blob/main/Group_9_project_Kelsey_Try.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Group 9 members




*   Athuluri, Vamsi Ram
*   Ing, John
*   Marowa, Noel
*   Shaun, Kelsey
*   Taylor, Richard





### Import Necessary Python Libraries

In [2]:
import os
import gdown
import pandas as pd


In [3]:
filename = '/SMSSpamCollection.csv'
url = 'https://drive.google.com/file/d/1yls6NTKOjgCGsngErqKw975QqR2lGM6B/view?usp=drive_link'
output_path = os.getcwd() + '/' + filename
# Function to download files if not already present
def download_file(url, filename):
    if not os.path.exists(filename):
        print(f"Downloading {filename}...")
        gdown.download(url, output_path, quiet=False, fuzzy=True)
        print(f"File downloaded to: {output_path}")
    else:
        print(f"{filename} already exists. Skipping download.")

download_file(url, filename)

print("All files are ready.")

Downloading /SMSSpamCollection.csv...


Downloading...
From: https://drive.google.com/uc?id=1yls6NTKOjgCGsngErqKw975QqR2lGM6B
To: /content/SMSSpamCollection.csv
100%|██████████| 478k/478k [00:00<00:00, 75.2MB/s]

File downloaded to: /content//SMSSpamCollection.csv
All files are ready.





In [6]:
# The file is a tab-separated file with no header, based on the standard UCI SMS Spam Collection dataset.
cols = ['label', 'message']
spam_df = pd.read_csv(output_path, sep='\	', header=None, names=cols, encoding='utf-8')

print(spam_df.head())
print(spam_df['label'].value_counts())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
label
ham     4827
spam     747
Name: count, dtype: int64


  spam_df = pd.read_csv(output_path, sep='\	', header=None, names=cols, encoding='utf-8')
  spam_df = pd.read_csv(output_path, sep='\	', header=None, names=cols, encoding='utf-8')


In [8]:
# This cell applies lowercase and basic cleaning, then fits a TfidfVectorizer
# with unigrams and bigrams, using English stopwords, on the existing spam_df.

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Basic cleaning function: lowercase and strip whitespace
def basic_clean(text):
    if isinstance(text, str):
        return text.lower().strip()
    return ""

spam_df['clean_message'] = spam_df['message'].apply(basic_clean)

vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
X_tfidf = vectorizer.fit_transform(spam_df['clean_message'])

print(X_tfidf.shape)

# Show a small sample of feature names
feature_names = vectorizer.get_feature_names_out()[:50]
print(feature_names)

(5574, 37360)
['00' '00 easter' '00 sub' '00 subs' '000' '000 bonus' '000 cash'
 '000 homeowners' '000 pounds' '000 price' '000 prize' '000 xmas' '000pes'
 '000pes 48' '008704050406' '008704050406 sp' '0089' '0089 digits' '0121'
 '0121 2025050' '01223585236' '01223585236 xx' '01223585334'
 '01223585334 cum' '0125698789' '0125698789 ring' '02' '02 06' '02 09'
 '02 claimcode' '02 user' '0207' '0207 083' '0207 153' '02072069400'
 '02072069400 bx' '02073162414' '02073162414 costs' '02085076972'
 '02085076972 reply' '021' '021 3680' '03' '03 05' '03 2nd' '03 final'
 '03 marsms' '04' '04 08717507382' '0430']


In [13]:
spam_df

Unnamed: 0,label,message,clean_message
0,ham,"Go until jurong point, crazy.. Available only ...","go until jurong point, crazy.. available only ..."
1,ham,Ok lar... Joking wif u oni...,ok lar... joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor... u c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro...","nah i don't think he goes to usf, he lives aro..."
...,...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...
5570,ham,Will ü b going to esplanade fr home?,will ü b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other s...","pity, * was in mood for that. so...any other s..."
5572,ham,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like i'd...


In [14]:
X_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 83520 stored elements and shape (5574, 37360)>

In [9]:
# This cell reduces the TF-IDF matrix to 100 components using TruncatedSVD
# and shows how much variance is explained by these components.

from sklearn.decomposition import TruncatedSVD
import numpy as np

n_components = 100
svd_100 = TruncatedSVD(n_components=n_components, random_state=42)
X_svd_100 = svd_100.fit_transform(X_tfidf)

explained_variance_ratio = svd_100.explained_variance_ratio_

print(X_tfidf.shape)
print(X_svd_100.shape)
print(np.sum(explained_variance_ratio))
print(explained_variance_ratio[:10])

(5574, 37360)
(5574, 100)
0.13391271472416214
[0.00648116 0.00625961 0.00414466 0.00194333 0.00269402 0.00234293
 0.00238844 0.00236674 0.00224885 0.00202245]


In [10]:
# Reduce TF-IDF to 1000 components with TruncatedSVD and show explained variance

from sklearn.decomposition import TruncatedSVD
import numpy as np

n_components_1000 = 1000
svd_1000 = TruncatedSVD(n_components=n_components_1000, random_state=42)
X_svd_1000 = svd_1000.fit_transform(X_tfidf)

explained_variance_ratio_1000 = svd_1000.explained_variance_ratio_

print(X_tfidf.shape)
print(X_svd_1000.shape)
print(np.sum(explained_variance_ratio_1000))
print(explained_variance_ratio_1000[980:1000])

(5574, 37360)
(5574, 1000)
0.4698516469752707
[0.00648116 0.00625961 0.00414466 0.00194335 0.00269401 0.00234289
 0.00238847 0.00236675 0.00224885 0.00202254]


In [11]:
print(explained_variance_ratio_1000[980:1000])

[0.00020422 0.00020386 0.00020378 0.00020341 0.00020309 0.00020287
 0.00020264 0.00020208 0.00020193 0.00020174 0.00020154 0.00020088
 0.00020049 0.00020031 0.00020007 0.00019965 0.00019919 0.00019881
 0.00019862 0.00019832]


In [12]:
# Minimal example: create a fake y if it does not exist, then train models on X_svd_1000
# This is just to show you how to plug X_svd_1000 into a classifier.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np

# If X_svd_1000 or y are not defined in this notebook, create dummy ones so code runs
try:
    X_svd_1000
except NameError:
    # create a random matrix with 1000 features and 1000 samples
    X_svd_1000 = np.random.randn(1000, 1000)

try:
    y
except NameError:
    # create dummy binary labels spam/ham
    y = np.where(np.random.rand(X_svd_1000.shape[0]) > 0.8, "spam", "ham")

X_train, X_test, y_train, y_test = train_test_split(
    X_svd_1000, y, test_size=0.2, random_state=42, stratify=y
)

log_clf = LogisticRegression(max_iter=1000)
log_clf.fit(X_train, y_train)
log_pred = log_clf.predict(X_test)

print("Logistic Regression (SVD-1000) accuracy:", accuracy_score(y_test, log_pred))
print("Logistic Regression (SVD-1000) F1 (spam as positive):",
      f1_score(y_test, log_pred, pos_label="spam"))

svm_clf = LinearSVC()
svm_clf.fit(X_train, y_train)
svm_pred = svm_clf.predict(X_test)

print("Linear SVM (SVD-1000) accuracy:", accuracy_score(y_test, svm_pred))
print("Linear SVM (SVD-1000) F1 (spam as positive):",
      f1_score(y_test, svm_pred, pos_label="spam"))

Logistic Regression (SVD-1000) accuracy: 0.8044843049327354
Logistic Regression (SVD-1000) F1 (spam as positive): 0.0
Linear SVM (SVD-1000) accuracy: 0.7677130044843049
Linear SVM (SVD-1000) F1 (spam as positive): 0.05818181818181818
