In [None]:
# Week 3 – Task 1: Semi-Supervised Learning — SMS Spam Detection

In [1]:
import pandas as pd

url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
# Preprocess Text

import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = stopwords.words('english')

# Clean text
def clean_text(text):
    text = text.lower()
    text = ''.join([ch for ch in text if ch not in string.punctuation])
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['cleaned'] = df['message'].apply(clean_text)

# Convert labels to numbers
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# TF-IDF Vectorization

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['cleaned'])
y = df['label_num'].copy()


In [4]:
# Semi-Supervised Setup

import numpy as np
from sklearn.semi_supervised import LabelSpreading

# Use only 20% labeled data
rng = np.random.RandomState(42)
n_total = len(y)
n_labeled = int(n_total * 0.2)

# Randomly mask 80% of labels
indices = np.arange(n_total)
rng.shuffle(indices)

y_semi = np.copy(y)
y_semi[indices[n_labeled:]] = -1  # unlabeled portion

# Apply Label Spreading
model = LabelSpreading(kernel='knn', n_neighbors=5)
model.fit(X, y_semi)

# Predict labels
y_pred = model.transduction_

In [5]:
# Evaluate

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate only on the originally labeled set
y_true_eval = y[indices[:n_labeled]]
y_pred_eval = y_pred[indices[:n_labeled]]

print("Accuracy:", accuracy_score(y_true_eval, y_pred_eval))
print("Precision:", precision_score(y_true_eval, y_pred_eval))
print("Recall:", recall_score(y_true_eval, y_pred_eval))
print("F1 Score:", f1_score(y_true_eval, y_pred_eval))

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


In [None]:
# Week 3 – Task 2: Song Genre Classification using Audio Features

# *Since your dataset doesn't contain a genre column, but it does have a popularity column, we can treat popularity as the target variable for a classification task by converting it into popularity levels (e.g., low, medium, high).*

In [11]:
import pandas as pd

df = pd.read_csv("SpotifyAudioFeaturesApril2019.csv")
df.head()

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,YG,2RM4jf1Xa9zPgMGRDiht8O,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",0.00582,0.743,238373,0.339,0.0,1,0.0812,-7.678,1,0.409,203.927,4,0.118,15
1,YG,1tHDG53xJNGsItRA3vfVgs,BAND DRUM (feat. A$AP Rocky),0.0244,0.846,214800,0.557,0.0,8,0.286,-7.259,1,0.457,159.009,4,0.371,0
2,R3HAB,6Wosx2euFPMT14UXiWudMy,Radio Silence,0.025,0.603,138913,0.723,0.0,9,0.0824,-5.89,0,0.0454,114.966,4,0.382,56
3,Chris Cooq,3J2Jpw61sO7l6Hc7qdYV91,Lactose,0.0294,0.8,125381,0.579,0.912,5,0.0994,-12.118,0,0.0701,123.003,4,0.641,0
4,Chris Cooq,2jbYvQCyPgX3CdmAzeVeuS,Same - Original mix,3.5e-05,0.783,124016,0.792,0.878,7,0.0332,-10.277,1,0.0661,120.047,4,0.928,0


In [13]:
# Drop non-feature columns
df = df.drop(columns=['artist_name', 'track_id', 'track_name'])

# Create a popularity class (low=0, medium=1, high=2)
def popularity_label(val):
    if val <= 30:
        return 'low'
    elif val <= 60:
        return 'medium'
    else:
        return 'high'

df['popularity_label'] = df['popularity'].apply(popularity_label)

# Drop the original popularity column
df = df.drop(columns=['popularity'])

# Handle nulls (if any)
df.dropna(inplace=True)

# Encode target labels
from sklearn.preprocessing import LabelEncoder, StandardScaler

le = LabelEncoder()
df['popularity_encoded'] = le.fit_transform(df['popularity_label'])

# Split features and target
X = df.drop(columns=['popularity_label', 'popularity_encoded'])
y = df['popularity_encoded']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-Test Split and Modeling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))


Confusion Matrix:
 [[   42   902   352]
 [    5 15683  1370]
 [    8  6235  1536]]

Classification Report:
               precision    recall  f1-score   support

        high       0.76      0.03      0.06      1296
         low       0.69      0.92      0.79     17058
      medium       0.47      0.20      0.28      7779

    accuracy                           0.66     26133
   macro avg       0.64      0.38      0.38     26133
weighted avg       0.63      0.66      0.60     26133

