In [None]:
# Q1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.ensemble import AdaBoostClassifier
from nltk.corpus import stopwords
import string
import matplotlib.pyplot as plt
import nltk
# Fetch the NLTK stop-word corpus, then load the spam CSV keeping only the
# label column (v1) and the message column (v2) under friendlier names.
nltk.download('stopwords')
raw=pd.read_csv('/mnt/data/spam.csv',encoding='latin-1')
df=raw[['v1','v2']].rename(columns={'v1':'label','v2':'text'})
# Encode the target numerically: spam -> 1, ham -> 0.
label_codes={'spam':1,'ham':0}
df['label']=df['label'].map(label_codes)
# English stop words as a set for O(1) membership tests in clean().
sw=set(stopwords.words('english'))
def clean(t):
    """Normalise one message for vectorisation.

    The input is coerced to ``str`` and lower-cased, every character listed
    in ``string.punctuation`` is removed, and whitespace-separated tokens
    found in the module-level stop-word set ``sw`` are dropped. The surviving
    tokens are returned joined by single spaces.
    """
    lowered=str(t).lower()
    # One translate pass deletes all punctuation characters.
    no_punct=lowered.translate(str.maketrans('','',string.punctuation))
    kept=[tok for tok in no_punct.split() if tok not in sw]
    return ' '.join(kept)
# Clean the messages, split into train/test, vectorise, and fit a single
# decision stump as the baseline weak learner.
df['text']=df['text'].apply(clean)
X=df['text'].values
y=df['label'].values
# Split the raw text BEFORE fitting TF-IDF so the vocabulary and IDF weights
# are learned from training messages only; fitting on the full corpus would
# leak test-set statistics into the features.
X_text_tr,X_text_te,ytr,yte=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)
tf=TfidfVectorizer()
Xtr=tf.fit_transform(X_text_tr)
Xte=tf.transform(X_text_te)
print("Class distribution (train):",np.bincount(ytr))
# Baseline: one depth-1 tree (a decision stump).
stump=DecisionTreeClassifier(max_depth=1,random_state=42)
stump.fit(Xtr,ytr)
p_tr=stump.predict(Xtr)
p_te=stump.predict(Xte)
print("Stump Train Acc",accuracy_score(ytr,p_tr),"Test Acc",accuracy_score(yte,p_te))
print("Stump Confusion Matrix\n",confusion_matrix(yte,p_te))
print("Stump Classification Report\n",classification_report(yte,p_te))
# Manual discrete AdaBoost over decision stumps.
# Each round fits a stump on the current sample weights, measures its weighted
# error, derives its vote alpha, and re-weights the training samples.
T=15
n=Xtr.shape[0]
w=np.ones(n)/n
train_errors=[]
alphas=[]
models=[]
# Clipping bound: keeps alpha finite when a stump is perfect (err=0) or
# wrong on every weighted sample (err=1, which would otherwise give log(0)).
eps=1e-12
# Running alpha-weighted vote of the ensemble on the training set, with labels
# recoded from {0,1} to {-1,+1}; avoids re-predicting every stored stump.
margin=np.zeros(n)
for t in range(1,T+1):
    clf=DecisionTreeClassifier(max_depth=1,random_state=42)
    clf.fit(Xtr,ytr,sample_weight=w)
    pred=clf.predict(Xtr)
    miss=(pred!=ytr).astype(int)
    err=np.dot(w,miss)/w.sum()
    err_clipped=min(max(err,eps),1-eps)
    alpha=0.5*np.log((1-err_clipped)/err_clipped)
    # With alpha = 0.5*ln((1-err)/err) the matching update is exp(+alpha) on
    # mistakes and exp(-alpha) on hits; a factor of 2 in the exponent would
    # square the intended reweighting.
    w=w*np.exp(alpha*(2*miss-1))
    w=w/w.sum()
    models.append((clf,alpha))
    margin=margin+alpha*(pred*2-1)
    # Ensemble prediction so far: sign of the margin mapped back to {0,1}
    # (a zero margin maps to class 0).
    train_pred=(np.sign(margin)+1)//2
    train_acc=accuracy_score(ytr,train_pred)
    train_errors.append(err)
    alphas.append(alpha)
    mis_idx=np.where(miss==1)[0]
    print("Iteration",t)
    print("Misclassified indices (sampled subset)",mis_idx[:20])
    print("Weights of misclassified (sampled subset)",np.round(w[mis_idx[:20]],6))
    print("Alpha",alpha)
# Two diagnostic curves over the boosting rounds: the weighted error of each
# stump and the vote (alpha) it received.
rounds=range(1,T+1)
curves=(
    (train_errors,'Weighted error','Iteration vs weighted error'),
    (alphas,'Alpha','Iteration vs alpha'),
)
for series,y_label,heading in curves:
    plt.figure()
    plt.plot(rounds,series,marker='o')
    plt.xlabel('Iteration')
    plt.ylabel(y_label)
    plt.title(heading)
    plt.show()
def predict_ensemble(models,X):
    """Predict 0/1 labels with a weighted vote of boosted learners.

    ``models`` is an iterable of ``(learner, alpha)`` pairs. Each learner's
    0/1 prediction on ``X`` is recoded to -1/+1, scaled by its ``alpha`` and
    accumulated; the sign of the total is mapped back to 0/1. A total of
    exactly zero maps to class 0.
    """
    score=0
    for learner,weight in models:
        votes=learner.predict(X)*2-1
        score=score+weight*votes
    labels=np.floor_divide(np.sign(score)+1,2)
    return labels.astype(int)
# Score the hand-rolled ensemble, then fit scikit-learn's AdaBoost on the same
# split for comparison.
train_final=predict_ensemble(models,Xtr)
test_final=predict_ensemble(models,Xte)
print("Manual AdaBoost Train Acc",accuracy_score(ytr,train_final),"Test Acc",accuracy_score(yte,test_final))
print("Manual AdaBoost Confusion Matrix\n",confusion_matrix(yte,test_final))
# The weak learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases, and the first
# positional slot is the base learner under both names.
skada=AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=100,learning_rate=0.6,random_state=42)
skada.fit(Xtr,ytr)
print("Sklearn AdaBoost Train",skada.score(Xtr,ytr),"Test",skada.score(Xte,yte))
print("Sklearn Confusion Matrix\n",confusion_matrix(yte,skada.predict(Xte)))




In [None]:
# Q2
import sklearn
from sklearn.datasets import load_heart_disease
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Load the heart-disease table, pick categorical vs numeric columns, and build
# train/test feature matrices.
# NOTE(review): `load_heart_disease` is imported from sklearn.datasets above;
# no loader of that name appears in scikit-learn's documented dataset API --
# confirm the import resolves in the target environment or swap in the
# intended data source.
hd=load_heart_disease()
X=pd.DataFrame(hd.data,columns=hd.feature_names)
y=hd.target
# Heuristic: a column with at most 6 distinct values is treated as categorical.
cat_cols=[c for c in X.columns if X[c].nunique()<=6]
num_cols=[c for c in X.columns if c not in cat_cols]
ct=ColumnTransformer([('o',OneHotEncoder(handle_unknown='ignore'),cat_cols),
                      ('s',StandardScaler(),num_cols)])
# Split the raw rows first and fit the encoder/scaler on the training rows
# only, so test-set categories, means and variances do not leak into the
# features.
X_raw_tr,X_raw_te,ytr,yte=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)
Xtr=ct.fit_transform(X_raw_tr)
Xte=ct.transform(X_raw_te)
# Full transformed matrix, kept so code that refers to `Xt` keeps working.
Xt=ct.transform(X)
# Baseline for the heart data: a single depth-1 decision tree.
stump=DecisionTreeClassifier(random_state=42,max_depth=1)
stump.fit(Xtr,ytr)
stump_train_acc=stump.score(Xtr,ytr)
stump_test_acc=stump.score(Xte,yte)
print("Heart Stump Train",stump_train_acc,"Test",stump_test_acc)
# Predict the test split once and reuse it for both reports.
stump_test_pred=stump.predict(Xte)
print("Heart Stump CM\n",confusion_matrix(yte,stump_test_pred))
print("Heart Stump Report\n",classification_report(yte,stump_test_pred))
# Grid over ensemble size and learning rate; one accuracy curve per learning
# rate, then keep the configuration with the highest test accuracy.
n_estimators=[5,10,25,50,100]
learning_rate=[0.1,0.5,1.0]
# Each entry is (learning_rate, n_estimators, test_accuracy, fitted_model).
results=[]
for lr in learning_rate:
    accs=[]
    for n_est in n_estimators:
        # Base learner passed positionally: the keyword was renamed from
        # `base_estimator` to `estimator` across scikit-learn releases.
        m=AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=n_est,learning_rate=lr,random_state=42)
        m.fit(Xtr,ytr)
        acc=m.score(Xte,yte)  # scored once and reused below
        accs.append(acc)
        results.append((lr,n_est,acc,m))
    plt.plot(n_estimators,accs,label=f'lr={lr}')
plt.xlabel('n_estimators')
plt.ylabel('accuracy')
plt.legend()
plt.title('n_estimators vs accuracy for different learning rates')
plt.show()
# NOTE(review): the winner is chosen by accuracy on the test split, so the
# reported best accuracy is an optimistic estimate; a validation split or
# cross-validation would give an unbiased pick.
best=max(results,key=lambda x:x[2])
print("Best config lr,n_est,acc",best[0],best[1],best[2])
best_model=best[3]
# Per-learner diagnostics for the selected model: the unweighted training
# misclassification rate of every fitted weak learner, and the distribution of
# the weights the ensemble assigned to them.
errors=[
    (learner.predict(Xtr)!=ytr).astype(int).mean()
    for learner in best_model.estimators_
]
weights_history=best_model.estimator_weights_
iterations=range(1,len(errors)+1)
plt.figure()
plt.plot(iterations,errors,marker='o')
plt.xlabel('Iteration')
plt.ylabel('Weak learner error')
plt.title('Weak learner error vs iteration')
plt.show()
plt.figure()
plt.hist(best_model.estimator_weights_,bins=20)
plt.xlabel('Final estimator weights')
plt.title('Distribution of estimator weights')
plt.show()
# Rank the transformed features by the importance the selected ensemble
# assigns them and show the top five.
importances=best_model.feature_importances_
# Output names come straight from the fitted ColumnTransformer; no branching
# on the matrix type is needed because the call is the same either way.
feat_names=ct.get_feature_names_out()
imp_df=pd.Series(importances,index=feat_names).sort_values(ascending=False)
print("Top 5 features\n",imp_df.head(5))
print("Feature importances explanation: top features likely relate to known medical risk factors")


In [None]:
# Q3
import os
# Parse the WISDM raw accelerometer log into a DataFrame with one row per
# reading: user, activity, timestamp, x, y, z.
# NOTE(review): the raw file is expected to end each record with ';' and may
# pack several records onto one line -- confirm against the data file. The
# parsing below also works for plain comma-separated lines.
path='/mnt/data/WISDM_ar_v1.1_raw.txt'
rows=[]
with open(path,'r') as f:
    for line in f:
        for record in line.strip().split(';'):
            parts=[p.strip() for p in record.split(',')]
            # All six fields are required; shorter records are skipped.
            if len(parts)<6:
                continue
            user,activity,timestamp,x,y,z=parts[:6]
            try:
                rows.append([user,activity,float(timestamp),float(x),float(y),float(z)])
            except ValueError:
                # A non-numeric or empty timestamp/axis value: drop the record.
                continue
wdf=pd.DataFrame(rows,columns=['user','activity','timestamp','x','y','z'])
# Reference groupings of activity names (not consulted by label() below).
vig={'Jogging','Upstairs','UP','jogging','upstairs'}
light={'Walking','Sitting','Standing','Downstairs','walking','sitting','standing','downstairs'}
def label(a):
    """Return 1 if the activity name contains 'jog' or 'up', else 0.

    The match is a case-insensitive substring test on ``a``.
    """
    lowered=a.lower()
    for marker in ('jog','up'):
        if marker in lowered:
            return 1
    return 0
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Binary target from the activity name, then drop any incomplete rows.
wdf['label']=wdf['activity'].apply(label)
wdf=wdf.dropna()
# Features are the three accelerometer axes.
axis_cols=['x','y','z']
X=wdf[axis_cols].values
y=wdf['label'].values
Xtr,Xte,ytr,yte=train_test_split(X,y,test_size=0.3,random_state=42,stratify=y)
# Standardise using statistics from the training split only.
sc=StandardScaler()
sc.fit(Xtr)
Xtr_s=sc.transform(Xtr)
Xte_s=sc.transform(Xte)
# Baseline: a single decision stump.
stump=DecisionTreeClassifier(max_depth=1,random_state=42)
stump.fit(Xtr_s,ytr)
stump_train_acc=stump.score(Xtr_s,ytr)
stump_test_acc=stump.score(Xte_s,yte)
print("WISDM Stump Train",stump_train_acc,"Test",stump_test_acc)
# Manual discrete AdaBoost over decision stumps on the scaled WISDM features.
# Each round fits a stump on the current sample weights, measures its weighted
# error, derives its vote alpha, and re-weights the training samples.
T=20
n=Xtr_s.shape[0]
w=np.ones(n)/n
models=[]
alphas=[]
errors=[]
# Clipping bound: keeps alpha finite when a stump is perfect (err=0) or
# wrong on every weighted sample (err=1, which would otherwise give log(0)).
eps=1e-12
for t in range(1,T+1):
    clf=DecisionTreeClassifier(max_depth=1,random_state=42)
    clf.fit(Xtr_s,ytr,sample_weight=w)
    pred=clf.predict(Xtr_s)
    miss=(pred!=ytr).astype(int)
    err=np.dot(w,miss)/w.sum()
    err_clipped=min(max(err,eps),1-eps)
    alpha=0.5*np.log((1-err_clipped)/err_clipped)
    # With alpha = 0.5*ln((1-err)/err) the matching update is exp(+alpha) on
    # mistakes and exp(-alpha) on hits; a factor of 2 in the exponent would
    # square the intended reweighting.
    w=w*np.exp(alpha*(2*miss-1))
    w=w/w.sum()
    models.append((clf,alpha))
    alphas.append(alpha)
    errors.append(err)
    # First 20 misclassified indices, computed once for both printed fields.
    mis_idx=np.where(miss==1)[0][:20]
    print("Iteration",t,"Mis idx",mis_idx,"Weights sample",np.round(w[mis_idx],6))
# Per-round diagnostics: weighted error of each stump, then its alpha.
rounds=range(1,T+1)
for series,y_label in ((errors,'Weighted error'),(alphas,'Alpha')):
    plt.figure()
    plt.plot(rounds,series,marker='o')
    plt.xlabel('Round')
    plt.ylabel(y_label)
    plt.show()
def ens_predict(models,X):
    """Return 0/1 predictions from an alpha-weighted vote.

    ``models`` holds ``(learner, alpha)`` pairs. Every learner's 0/1 output on
    ``X`` is mapped to -1/+1 and multiplied by its ``alpha``; the summed vote
    is thresholded by sign and mapped back to 0/1. A vote of exactly zero
    yields class 0.
    """
    weighted_votes=[alpha*(learner.predict(X)*2-1) for learner,alpha in models]
    total=sum(weighted_votes)
    return np.floor_divide(np.sign(total)+1,2).astype(int)
# Score the hand-rolled ensemble on WISDM, then fit scikit-learn's AdaBoost on
# the same scaled split for comparison.
train_pred=ens_predict(models,Xtr_s)
test_pred=ens_predict(models,Xte_s)
print("Manual AdaBoost WISDM Train",accuracy_score(ytr,train_pred),"Test",accuracy_score(yte,test_pred))
print("Confusion Matrix\n",confusion_matrix(yte,test_pred))
# The weak learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases, and the first
# positional slot is the base learner under both names.
skada=AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=100,learning_rate=1.0,random_state=42)
skada.fit(Xtr_s,ytr)
print("Sklearn AdaBoost WISDM Train",skada.score(Xtr_s,ytr),"Test",skada.score(Xte_s,yte))
print("Sklearn Conf Matrix\n",confusion_matrix(yte,skada.predict(Xte_s)))
