# **Data Science Buildables Fellowship**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score

### **HR Employee Attrition**

In [2]:
df = pd.read_csv(r"D:\Buildables Internship\BuildablesDataScienceFellowship\Task#12\HR-Employee-Attrition.csv")
df.fillna(method="ffill", inplace=True)
y = df['Attrition'].map({'Yes':1, 'No':0})
X = pd.get_dummies(df.drop(columns=['Attrition']), drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
sc = StandardScaler(); X_train = sc.fit_transform(X_train); X_test = sc.transform(X_test)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))


Precision: 0.6153846153846154
Recall: 0.3404255319148936
F1-score: 0.4383561643835616


  df.fillna(method="ffill", inplace=True)


### **Heart Disease Prediction**

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, roc_auc_score 

# Load dataset
df = pd.read_csv(r"D:\Buildables Internship\BuildablesDataScienceFellowship\Task#12\heart_disease.csv")

# Handle missing values
df.fillna(df.mean(numeric_only=True), inplace=True)

# Separate features and target
y = df['Heart Disease Status'].map({'Yes':1, 'No':0})  # encode target
X = df.drop(columns=['Heart Disease Status'])

# One-hot encode categorical features
X = pd.get_dummies(X, drop_first=True)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Train KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Predictions
y_pred = knn.predict(X_test)
y_proba = knn.predict_proba(X_test)[:,1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

Accuracy: 0.7625
ROC-AUC: 0.5094546875


### **Hospital Readmissions**

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score

# Load dataset
df = pd.read_csv(r"D:\Buildables Internship\BuildablesDataScienceFellowship\Task#12\hospital_readmissions_30k.csv")

# Map target0
y = df['readmitted_30_days'].apply(lambda v: 1 if str(v).strip().lower() == "yes" else 0)

# One-hot encode categorical predictors
X = pd.get_dummies(df.drop(columns=['readmitted_30_days']), drop_first=True)

# Train-test split with stratification to handle class imbalance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale numeric features
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Logistic Regression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Metrics
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))


Precision: 0.22727272727272727
Recall: 0.006802721088435374
F1-score: 0.013210039630118891


### **Credit Card**

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score

df = pd.read_csv(r"D:\Buildables Internship\BuildablesDataScienceFellowship\Task#12\creditcard.csv")
y = df['Class']
X = df.drop(columns=['Class'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
sc = StandardScaler(); X_train = sc.fit_transform(X_train); X_test = sc.transform(X_test)

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
y_proba = dt.predict_proba(X_test)[:,1]

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

Confusion Matrix:
 [[56840    24]
 [   25    73]]
ROC-AUC: 0.8722379497662881


### **Wine Quality**

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

df = pd.read_csv(r"D:\Buildables Internship\BuildablesDataScienceFellowship\Task#12\winequality-red.csv")
y = (df['quality'] >= 7).astype(int)
X = df.drop(columns=['quality'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
sc = StandardScaler(); X_train = sc.fit_transform(X_train); X_test = sc.transform(X_test)

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
y_proba = dt.predict_proba(X_test)[:,1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


Accuracy: 0.90625
ROC-AUC: 0.8181512887247082


### **Spam.csv**

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

df = pd.read_csv(r"D:\Buildables Internship\BuildablesDataScienceFellowship\Task#12\spam.csv", encoding="latin-1")[['v1','v2']]
df = df.rename(columns={'v1':'label','v2':'message'})
df['label'] = df['label'].map({'spam':1, 'ham':0})

X = df['message']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

vec = TfidfVectorizer(max_features=5000, stop_words='english')
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)

nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
y_proba = nb.predict_proba(X_test)[:,1]

print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

Precision: 0.9916666666666667
Recall: 0.7986577181208053
F1-score: 0.8847583643122676
ROC-AUC: 0.9883210360304029


### **Diabetes**

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

df = pd.read_csv(r"D:\Buildables Internship\BuildablesDataScienceFellowship\Task#12\diabetes.csv")
y = df['Outcome']
X = df.drop(columns=['Outcome'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
sc = StandardScaler(); X_train = sc.fit_transform(X_train); X_test = sc.transform(X_test)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:,1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


Accuracy: 0.7597402597402597
ROC-AUC: 0.8147222222222222


### **Iris Dataset**

In [9]:


from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score

iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
sc = StandardScaler(); X_train = sc.fit_transform(X_train); X_test = sc.transform(X_test)

svm = SVC(kernel='rbf', probability=True, random_state=42)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision (macro):", precision_score(y_test, y_pred, average='macro'))
print("Recall (macro):", recall_score(y_test, y_pred, average='macro'))

Accuracy: 0.9666666666666667
Precision (macro): 0.9696969696969697
Recall (macro): 0.9666666666666667


### **Breast Cancer**

In [10]:
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

data = load_breast_cancer()
X, y = data.data, data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
sc = StandardScaler(); X_train = sc.fit_transform(X_train); X_test = sc.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.956140350877193
Confusion Matrix:
 [[39  3]
 [ 2 70]]
