In [2]:
import pandas as pd
import zipfile
import joblib
import io
import requests
from src.data_preprocessing import preprocess_df
from src.feature_extraction import extract_features
from src.model import train_model
from src.evaluation import evaluate_model


# End-to-end SMS spam pipeline: download the dataset, preprocess, vectorize,
# train, persist the model, reload it, and report the training metrics.

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of letting an error page
# reach ZipFile and fail later as a confusing BadZipFile.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Read the tab-separated 'SMSSpamCollection' member straight from the
# in-memory archive; the file has no header row, so column names are supplied.
# Both the archive and the member are closed when the block exits.
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
    with zip_file.open('SMSSpamCollection') as file:
        df = pd.read_csv(file, sep='\t', names=['label', 'message'])

# Quick look at the raw data and the class balance.
print(df.head())

print(df.info())
print(df['label'].value_counts())

# Project-specific cleaning (see src.data_preprocessing.preprocess_df).
df = preprocess_df(df)

print(df.head())

# extract_features returns the feature matrix together with the fitted
# vectorizer, which is needed later to transform unseen messages.
X, vectorizer = extract_features(df['message'])

# Encode the string labels as integers: ham -> 0, spam -> 1.
y = df['label'].map({'ham': 0, 'spam': 1})

print(X.shape)

# train_model returns the fitted model plus its evaluation metrics.
# NOTE(review): presumably it performs its own train/test split internally;
# confirm against src.model.
model, accuracy, conf_matrix, class_report = train_model(X, y)

# Save the model
model_path = "../models/logistic_regression.pkl"
joblib.dump(model, model_path)

# Load the model
loaded_model = joblib.load(model_path)

# Predict using the loaded model as a round-trip sanity check.
# The original code used `X_test`, which is never defined in this cell
# (train_model does not return its split), so it raised NameError on a fresh
# kernel. Predicting on the full feature matrix X exercises the reloaded model
# using only names that exist here.
new_predictions = loaded_model.predict(X)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)



  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None
label
ham     4825
spam     747
Name: count, dtype: int64
  label                                            message
0   ham  go jurong point, crazy.. available bugis n gre...
1   ham                      ok lar... joking wif u oni...
2  spam  free entry 2 wkly comp win fa cup final tkts 2...
3   ham          u dun say early hor... u c already 