In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

# Q1

In [118]:
df = pd.read_csv('../data/05/heart.csv')

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [119]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [120]:
X = df.drop('target', axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [121]:
l1_lr = LogisticRegression(penalty='l1', solver='liblinear')
l1_lr.fit(X_train, y_train)

train_l1 = accuracy_score(y_train, l1_lr.predict(X_train))
test_l1 = accuracy_score(y_test, l1_lr.predict(X_test))

In [122]:
l2_lr = LogisticRegression(penalty='l2', solver='liblinear')
l2_lr.fit(X_train, y_train)

train_l2 = accuracy_score(y_train, l2_lr.predict(X_train))
test_l2 = accuracy_score(y_test, l2_lr.predict(X_test))

In [123]:
en_lr = LogisticRegression(penalty='elasticnet', l1_ratio=0.5, solver='saga', max_iter=10000)
en_lr.fit(X_train, y_train)

train_en = accuracy_score(y_train, en_lr.predict(X_train))
test_en = accuracy_score(y_test, en_lr.predict(X_test))

In [124]:
results = pd.DataFrame({
    'Model': ['L1', 'L2', 'Elastic Net'],
    'Train Accuracy': [train_l1, train_l2, train_en],
    'Test Accuracy': [test_l1, test_l2, test_en]
})

results

Unnamed: 0,Model,Train Accuracy,Test Accuracy
0,L1,0.873171,0.785366
1,L2,0.869512,0.785366
2,Elastic Net,0.860976,0.790244


#### L1 shows error if solver is lbfgs
#### Elastic Net shows error if solver is not saga and needs l1_ratio

# Q2

In [125]:
df = pd.read_csv('../data/Iris.csv')

df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [126]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [127]:
df['Species'].value_counts()

Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

In [128]:
X = df.drop(['Species', 'Id'], axis=1)
y = df['Species']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [129]:
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=2000)
lr.fit(X_train, y_train)

train_lbfgs = accuracy_score(y_train, lr.predict(X_train))
test_lbfgs = accuracy_score(y_test, lr.predict(X_test))



In [130]:
lr = LogisticRegression(multi_class='multinomial', solver='newton-cholesky', max_iter=2000)
lr.fit(X_train, y_train)

train_newton = accuracy_score(y_train, lr.predict(X_train))
test_newton = accuracy_score(y_test, lr.predict(X_test))



In [131]:
lr = LogisticRegression(multi_class='multinomial', solver='sag', max_iter=20000)
lr.fit(X_train, y_train)

train_sag = accuracy_score(y_train, lr.predict(X_train))
test_sag = accuracy_score(y_test, lr.predict(X_test))



In [132]:
lr = LogisticRegression(multi_class='multinomial', solver='saga', max_iter=20000)
lr.fit(X_train, y_train)

train_saga = accuracy_score(y_train, lr.predict(X_train))
test_saga = accuracy_score(y_test, lr.predict(X_test))



In [133]:
results = pd.DataFrame({
    'Solver': ['lbfgs', 'newton-cholesky', 'sag', 'saga'],
    'Train Accuracy': [train_lbfgs, train_newton, train_sag, train_saga],
    'Test Accuracy': [test_lbfgs, test_newton, test_sag, test_saga]
})

results

Unnamed: 0,Solver,Train Accuracy,Test Accuracy
0,lbfgs,0.975,1.0
1,newton-cholesky,0.975,1.0
2,sag,0.975,1.0
3,saga,0.983333,1.0


# Q3

In [134]:
df['Target'] = (df['Species'] == 'Iris-setosa').astype(int)

In [135]:
X = df.drop(['Species', 'Target', 'Id'], axis=1)
y = df['Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [136]:
X_train.shape[1]

4

In [137]:
model = Sequential()
model.add(Dense(1, activation='sigmoid', input_shape=(X_train.shape[1],)))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [138]:
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

Epoch 1/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 95ms/step - accuracy: 0.6302 - loss: 0.5530 - val_accuracy: 0.7917 - val_loss: 0.3424
Epoch 2/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.6185 - loss: 0.5645 - val_accuracy: 0.7917 - val_loss: 0.3376
Epoch 3/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.6615 - loss: 0.5054 - val_accuracy: 0.7917 - val_loss: 0.3330
Epoch 4/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.6536 - loss: 0.4978 - val_accuracy: 0.7917 - val_loss: 0.3283
Epoch 5/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.5911 - loss: 0.5806 - val_accuracy: 0.7917 - val_loss: 0.3238


<keras.src.callbacks.history.History at 0x212a6124620>

In [139]:
y_pred = model.predict(X_test)
y_pred = np.round(y_pred)

display(accuracy_score(y_test, y_pred))
display(accuracy_score(y_train, np.round(model.predict(X_train))))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step


0.6666666666666666

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 


0.6666666666666666

# Q4

In [140]:
df = pd.read_csv("../data/09/WineQT.csv")

df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,3
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,4


In [141]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1143 non-null   float64
 1   volatile acidity      1143 non-null   float64
 2   citric acid           1143 non-null   float64
 3   residual sugar        1143 non-null   float64
 4   chlorides             1143 non-null   float64
 5   free sulfur dioxide   1143 non-null   float64
 6   total sulfur dioxide  1143 non-null   float64
 7   density               1143 non-null   float64
 8   pH                    1143 non-null   float64
 9   sulphates             1143 non-null   float64
 10  alcohol               1143 non-null   float64
 11  quality               1143 non-null   int64  
 12  Id                    1143 non-null   int64  
dtypes: float64(11), int64(2)
memory usage: 116.2 KB


In [142]:
df['quality'].value_counts()

quality
5    483
6    462
7    143
4     33
8     16
3      6
Name: count, dtype: int64

In [143]:
X = df.drop(['Id', 'quality'], axis=1)
y = df['quality'] - 3

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [144]:
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(64, activation='tanh'))
model.add(Dense(32, activation='relu'))
model.add(Dense(6, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=50, batch_size=32, validation_batch_size=0.2)

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4636 - loss: 1.2689
Epoch 2/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4756 - loss: 1.1650 
Epoch 3/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4985 - loss: 1.1464
Epoch 4/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4990 - loss: 1.1451 
Epoch 5/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4648 - loss: 1.1469 
Epoch 6/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4894 - loss: 1.1319 
Epoch 7/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5125 - loss: 1.1164
Epoch 8/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4919 - loss: 1.1084
Epoch 9/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

<keras.src.callbacks.history.History at 0x21295e6c620>

In [145]:
y_train_pred = np.argmax(model.predict(X_train), axis=1)
y_test_pred  = np.argmax(model.predict(X_test), axis=1)

print("Accuracy Train: ", accuracy_score(y_train, y_train_pred))
print("Accuracy Test: ", accuracy_score(y_test, y_test_pred))
print()
print("Precision Train: ", precision_score(y_train, y_train_pred, average='weighted'))
print("Precision Test: ", precision_score(y_test, y_test_pred, average='weighted'))
print()
print("Recall Train: ", recall_score(y_train, y_train_pred, average='weighted'))
print("Recall Test: ", recall_score(y_test, y_test_pred, average='weighted'))
print()
print("F1 Score Train: ", f1_score(y_train, y_train_pred, average='weighted'))
print("F1 Score Test: ", f1_score(y_test, y_test_pred, average='weighted'))

[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
Accuracy Train:  0.6203501094091903
Accuracy Test:  0.6462882096069869

Precision Train:  0.5859608995903295
Precision Test:  0.6208196170641586

Recall Train:  0.6203501094091903
Recall Test:  0.6462882096069869

F1 Score Train:  0.6008875999778163
F1 Score Test:  0.633029416282543


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Q5

In [3]:
df = pd.read_csv('../data/09/data.csv')

df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [5]:
obj_cols = df.select_dtypes(include=['object']).columns
obj_cols = obj_cols.drop('date')
df = pd.get_dummies(df, columns=obj_cols, drop_first=True)

In [10]:
X = df.drop(['price', 'date'], axis=1)
y = df['price']

scaler_x = StandardScaler()
scaler_y = StandardScaler()
X_scaled = scaler_x.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.values.reshape(-1,1))

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

In [11]:
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(64, activation='tanh'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))

model.compile(optimizer="adam", loss='mean_squared_error', metrics=['mean_squared_error', 'mean_absolute_error'])

model.fit(X_train, y_train, batch_size=32, epochs=50, validation_batch_size=0.2)

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - loss: 0.4820 - mean_absolute_error: 0.4527 - mean_squared_error: 0.4820
Epoch 2/50
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 0.2155 - mean_absolute_error: 0.3170 - mean_squared_error: 0.2155
Epoch 3/50
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 0.0975 - mean_absolute_error: 0.1873 - mean_squared_error: 0.0975
Epoch 4/50
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 0.0805 - mean_absolute_error: 0.1313 - mean_squared_error: 0.0805
Epoch 5/50
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 0.0524 - mean_absolute_error: 0.1008 - mean_squared_error: 0.0524
Epoch 6/50
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 0.0339 - mean_absolute_error: 0.0800 - mean_squared_error: 0.0339
Epoch 7/50
[1m115/115[0m [32m━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x245c73a2300>

In [12]:
# Predict
y_pred_train_scaled = model.predict(X_train)
y_pred_test_scaled = model.predict(X_test)

# Inverse-transform predictions AND y back to original price scale
y_pred_train = scaler_y.inverse_transform(y_pred_train_scaled)
y_pred_test = scaler_y.inverse_transform(y_pred_test_scaled)
y_train_orig = scaler_y.inverse_transform(y_train)
y_test_orig = scaler_y.inverse_transform(y_test)

# Metrics
print("MSE Train:", mean_squared_error(y_train_orig, y_pred_train))
print("MSE Test:", mean_squared_error(y_test_orig, y_pred_test))
print()
print("MAE Train:", mean_absolute_error(y_train_orig, y_pred_train))
print("MAE Test:", mean_absolute_error(y_test_orig, y_pred_test))

[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
MSE Train: 111738399.5198078
MSE Test: 2004834576109.181

MAE Train: 6169.540674996014
MAE Test: 841924.5226550099
