# **Part 3: Machine Learning Fundamentals**
-------------------------------------------

### **Task 1: Machine Learning Basics**

#### **1. Data Splitting:**

+ ### **Remove null values**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
training_data = pd.read_csv('train.csv')

In [3]:
training_data.shape  # (rows, columns)

(891, 12)

In [4]:
training_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [5]:
training_data.isnull() # checking the missing value

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,False,False,False,False,True,False
887,False,False,False,False,False,False,False,False,False,False,False,False
888,False,False,False,False,False,True,False,False,False,False,True,False
889,False,False,False,False,False,False,False,False,False,False,False,False


In [6]:
training_data.isnull().sum() # checking the missing values per columns

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
training_data['Age'] = training_data['Age'].fillna(training_data['Age'].mean())
training_data['Age'] = training_data['Age'].astype(int)
training_data['Age']

0      22
1      38
2      26
3      35
4      35
       ..
886    27
887    19
888    29
889    26
890    32
Name: Age, Length: 891, dtype: int32

In [8]:
training_data['Embarked'] = training_data['Embarked'].fillna(training_data['Embarked'].mode()[0])
training_data['Embarked']

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [9]:
training_data = training_data.drop('Cabin', axis=1)
training_data.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,S


In [10]:
training_data.isnull().sum() # again check null values 

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [11]:
X = training_data.drop('Survived', axis=1)
y = training_data['Survived']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'Training set size: {len(X_train)}')
print(f'Testing set size: {len(X_test)}')

Training set size: 712
Testing set size: 179


+ ### **Encoding the training data**

In [12]:
training_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked'],
      dtype='object')

In [13]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    int32  
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     891 non-null    object 
dtypes: float64(1), int32(1), int64(5), object(4)
memory usage: 73.2+ KB


In [14]:
sex_en = LabelEncoder()
training_data['Sex']= sex_en.fit_transform(training_data['Sex'])

In [15]:
embarked_en = LabelEncoder()
training_data['Embarked']= embarked_en.fit_transform(training_data['Embarked'])

In [16]:
name_en = LabelEncoder()
training_data['Name']= name_en.fit_transform(training_data['Name'])

In [17]:
ticket_en = LabelEncoder()
training_data['Ticket']= ticket_en.fit_transform(training_data['Ticket'])

+ #### **Split the dataset into training and testing sets (80% train, 20% test)**

In [18]:
X = training_data.drop('Survived', axis=1)
y = training_data['Survived']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'Training set size: {len(X_train)}')
print(f'Testing set size: {len(X_test)}')

Training set size: 712
Testing set size: 179


+ #### **Train a logistic regression model on the training set**

+ supervised classification model 
+ dependent variable is catagorial and binary(0,1)
+ we use sigmoid function ( give values in between (0,1)) , Y= 1/1+e**-(a0 - a1 x) , a0 = intercept , a1 = coffecient , x= independent variable, Y = result 

In [19]:
# Split the dataset into features (X) and target (y)
from sklearn.linear_model import LogisticRegression

In [20]:
# Train the logistic regression lor
lor = LogisticRegression(random_state=42)
lor.fit(X_train, y_train)

In [21]:
# Print the lor coefficients and intercept
print('Logistic Regression Model Coefficients:')
print(dict(zip(X.columns, lor.coef_[0]))) # type: ignore
print(f'Intercept: {lor.intercept_[0]}') # type: ignore

Logistic Regression Model Coefficients:
{'PassengerId': 0.0010895257101938627, 'Pclass': -0.20982116605032894, 'Name': 0.00022813484712416558, 'Sex': -2.232441233975065, 'Age': 0.005430444902670999, 'SibSp': -0.39058374926684525, 'Parch': 0.030077372801801408, 'Ticket': -0.00011745340660767865, 'Fare': 0.011224890818493349, 'Embarked': -0.030357816665227653}
Intercept: 0.4287337223087307


In [22]:
# predict the value new unseen data
pred = lor.predict(X_test) # type: ignore
pred

array([0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0], dtype=int64)

In [23]:
X_test.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
709,710,3,561,1,29,1,1,189,15.2458,0
439,440,2,447,1,31,0,0,547,10.5,2
840,841,3,11,1,20,0,0,647,7.925,2


+ #### **Evaluate the model using accuracy, precision, recall, and F1-score**

  + **Check Accuracy:**
  + Accuracy tell you the percentage of mistakes in model. 

In [24]:
# call both alogorithm 
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier

In [25]:
# make two objects 
clf1 = LogisticRegression()
clf2 = DecisionTreeClassifier()

In [26]:
# train the same data 
clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)

In [27]:
y_pred1 = clf1.predict(X_test)
y_pred2 = clf2.predict(X_test)

In [28]:
from sklearn.metrics import accuracy_score, confusion_matrix
print("Accuracy of Logistic Regression", accuracy_score(y_test, y_pred1))
print("Accuracy of Decision Trees", accuracy_score(y_test, y_pred2))

Accuracy of Logistic Regression 0.7597765363128491
Accuracy of Decision Trees 0.7877094972067039


+ **Check Confusion:**
+ It is tell you the nature of mistakes.  

In [29]:
print("Logistic Regression Confusion Metrix\n")
pd.DataFrame(confusion_matrix(y_test, y_pred1), columns=[0,1])

Logistic Regression Confusion Metrix



Unnamed: 0,0,1
0,89,16
1,27,47


In [30]:
print("Decision Tree Confusion Metrix\n")
pd.DataFrame(confusion_matrix(y_test, y_pred2), columns=[0,1])

Decision Tree Confusion Metrix



Unnamed: 0,0,1
0,84,21
1,17,57


+ **Precision**
+ Precision = TP/(TP+FP)
+  **Recall**
+ Recall = TP/(TP+FN) 
+ **F1-score**
+ Its simply called combination of precision and recall or average of precision and recall
+ F1-score = 2PR/P+R 

In [31]:
from sklearn.metrics import recall_score, precision_score, f1_score

In [32]:
#For Logistic Regression
print("Logistic Regression Confusion Metrix\n")
print("-"*50)
cdf = pd.DataFrame(confusion_matrix(y_test, y_pred1), columns=[0,1])
print(cdf)
print("-"*50)
print("Precision - ", precision_score(y_test, y_pred1))
print("Recall - ", recall_score(y_test, y_pred1))
print("F1-score - ", f1_score(y_test, y_pred1))



Logistic Regression Confusion Metrix

--------------------------------------------------
    0   1
0  89  16
1  27  47
--------------------------------------------------
Precision -  0.746031746031746
Recall -  0.6351351351351351
F1-score -  0.6861313868613139


In [33]:
# For Decision Tree
print("Logistic Regression Confusion Metrix\n")
print("-"*50)
cdf = pd.DataFrame(confusion_matrix(y_test, y_pred2), columns=[0,1])
print(cdf)
print("-"*50)
print("Precision - ", precision_score(y_test, y_pred2))
print("Recall - ", recall_score(y_test, y_pred2))
print("F1-score - ", f1_score(y_test, y_pred2))


Logistic Regression Confusion Metrix

--------------------------------------------------
    0   1
0  84  21
1  17  57
--------------------------------------------------
Precision -  0.7307692307692307
Recall -  0.7702702702702703
F1-score -  0.75


### **Task 2: Intermediate Machine Learning**

#### **1. Hyperparameter Tuning:**

+ #### **Perform hyperparameter tuning on a decision tree classifier using grid search**

In [34]:
clf2

In [35]:
# make a dictionary of hyperparameter values to search 
search_space = {
    'max_depth': [3, 5, 7, 9, 11],
    'min_samples_split': [2, 4, 6, 8, 10],
    'criterion': ['gini', 'entropy']
    }

In [36]:
from sklearn.model_selection import GridSearchCV
# make a gridsearch object
GS= GridSearchCV(estimator=clf2, # model 
                 param_grid= search_space, 
                 cv=5,
                 scoring='accuracy'
)

In [37]:
GS.fit(X_train, y_train)

In [38]:
best_params = GS.best_params_
best_score = GS.best_score_

print("Best Hyperparameters:\n", best_params)
print("Best Accuracy Score:", best_score)

Best Hyperparameters:
 {'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 2}
Best Accuracy Score: 0.8145769723234512


In [39]:
# Evaluate the best model on the test set
best_model = GS.best_estimator_
y_pred = best_model.predict(X)
accuracy = accuracy_score(y, y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.8282828282828283


In [40]:
df = pd.DataFrame(GS.cv_results_)
df.to_csv("cv_results.csv")

#### **2. Cross-Validation:**

+ #### **Implement cross-validation to ensure robust model evaluation**

+ Cross Validation is the resampling techniuqe 
+ It helps estimating how well the model will perform on an independent dataset.

In [41]:
from sklearn.model_selection import cross_val_score
clf2

In [42]:
scores = cross_val_score(clf2, X, y, cv=10, scoring='accuracy')
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", scores.mean())
print("Standard Deviation:", scores.std())

Cross-Validation Scores: [0.71111111 0.73033708 0.73033708 0.68539326 0.71910112 0.80898876
 0.76404494 0.79775281 0.83146067 0.80898876]
Mean Accuracy: 0.7587515605493134
Standard Deviation: 0.0476668242540527


#### **3. Model Comparison:**

+ #### **Train and compare multiple models (e.g., logistic regression, decision tree, random forest, SVM)**

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [44]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC()
}

In [45]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2f}")

Logistic Regression Accuracy: 0.76
Decision Tree Accuracy: 0.80
Random Forest Accuracy: 0.84
SVM Accuracy: 0.67


+ #### **Select the best-performing model based on cross-validation scores**

In [46]:
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print(f"{name} Cross-Validation Accuracy: {scores.mean():.2f} (+/- {scores.std():.2f})")

# Select the best-performing model
best_model_name = max(models, key=lambda x: cross_val_score(models[x], X, y, cv=5, scoring='accuracy').mean())
best_model = models[best_model_name]

print(f"Best Model: {best_model_name}")

Logistic Regression Cross-Validation Accuracy: 0.76 (+/- 0.05)
Decision Tree Cross-Validation Accuracy: 0.73 (+/- 0.08)
Random Forest Cross-Validation Accuracy: 0.82 (+/- 0.03)
SVM Cross-Validation Accuracy: 0.66 (+/- 0.03)
Best Model: Random Forest


# **Part 4: Feature Engineering**
------------------------------------

### **Task 1: Advanced Feature Engineering**

#### **1. Interaction Features:**

+ #### **Create interaction features between 'Pclass', 'Sex', and 'Age'**

In [47]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(include_bias=False, interaction_only=True)

In [48]:
X

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,3,108,1,22,1,0,523,7.2500,2
1,2,1,190,0,38,1,0,596,71.2833,0
2,3,3,353,0,26,0,0,669,7.9250,2
3,4,1,272,0,35,1,0,49,53.1000,2
4,5,3,15,1,35,0,0,472,8.0500,2
...,...,...,...,...,...,...,...,...,...,...
886,887,2,548,1,27,0,0,101,13.0000,2
887,888,1,303,0,19,0,0,14,30.0000,2
888,889,3,413,0,29,1,2,675,23.4500,2
889,890,1,81,1,26,0,0,8,30.0000,0


In [49]:
X['Pclass_Sex'] = X['Pclass'] * X['Sex']
X['Pclass_Age'] = X['Pclass'] * X['Age']
X['Sex_Age'] = X['Sex'] * X['Age']
X


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Pclass_Sex,Pclass_Age,Sex_Age
0,1,3,108,1,22,1,0,523,7.2500,2,3,66,22
1,2,1,190,0,38,1,0,596,71.2833,0,0,38,0
2,3,3,353,0,26,0,0,669,7.9250,2,0,78,0
3,4,1,272,0,35,1,0,49,53.1000,2,0,35,0
4,5,3,15,1,35,0,0,472,8.0500,2,3,105,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,548,1,27,0,0,101,13.0000,2,2,54,27
887,888,1,303,0,19,0,0,14,30.0000,2,0,19,0
888,889,3,413,0,29,1,2,675,23.4500,2,0,87,0
889,890,1,81,1,26,0,0,8,30.0000,0,1,26,26


In [50]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2f}")

Logistic Regression Accuracy: 0.76
Decision Tree Accuracy: 0.78
Random Forest Accuracy: 0.85
SVM Accuracy: 0.67


#### **2. Feature Selection:**

+ #### **Perform feature selection using mutual information and feature importance from models**

In [51]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif

In [52]:
feature_names = X.columns
mutual_info = mutual_info_classif(X, y)
print("Mutual Information Scores:")
for score, name in sorted(zip(mutual_info, feature_names), reverse=True):
    print(f"{name}: {score:.2f}")

Mutual Information Scores:
Sex_Age: 0.18
Pclass_Sex: 0.17
Fare: 0.14
Sex: 0.13
Ticket: 0.12
Pclass_Age: 0.07
Pclass: 0.06
SibSp: 0.04
Parch: 0.04
Age: 0.02
PassengerId: 0.01
Embarked: 0.01
Name: 0.00


In [53]:
# Train a Random Forest Classifier and get feature importances
rf = RandomForestClassifier()
rf.fit(X, y)
importances = rf.feature_importances_

# Print feature importances
print("\nFeature Importances:")
for importance, name in sorted(zip(importances, feature_names), reverse=True):
    print(f"{name}: {importance:.2f}")


Feature Importances:
Sex_Age: 0.17
Ticket: 0.11
Pclass_Sex: 0.11
Name: 0.10
Fare: 0.10
PassengerId: 0.09
Pclass_Age: 0.08
Age: 0.06
Sex: 0.06
Pclass: 0.06
SibSp: 0.03
Embarked: 0.02
Parch: 0.02


#### **3. Dimensionality Reduction:**

+ #### **Apply PCA to reduce the dimensionality of the feature space**

In [54]:
from sklearn.decomposition import PCA
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

In [55]:
# Explain variance ratio
print("Explained Variance Ratio:")
print(pca.explained_variance_ratio_)

# Select the number of principal components
n_components = 2

Explained Variance Ratio:
[0.25503707 0.15745314 0.10937882 0.09519186 0.08410976 0.07615576
 0.07311125 0.05980811 0.04315258 0.03287232 0.00848062 0.00323907
 0.00200965]


In [56]:
# Transform the data using the selected number of principal components
X_reduced = pca.transform(X_scaled)[:, :n_components]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)

In [57]:
X_reduced

array([[ 1.24284898,  1.44992043],
       [-2.55486021, -1.04396102],
       [-0.7902469 ,  1.43852622],
       ...,
       [-1.36195185,  1.80005388],
       [-0.83482419, -2.15517536],
       [ 2.02118915,  0.09926372]])

In [58]:
# Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.80


# **Part 5: Intermediate Python and Object-Oriented Programming (OOP)**
----------------------------

### **Task 1: Intermediate Python I**

#### **1. Modular Code:**

+ #### **Write Python functions to modularize your code for data cleaning, feature engineering, and model training**

In [59]:
import pandas as pd
import numpy as np

+ Data Cleaning Function

In [60]:
def clean_data(df):
    
    # Drop any rows with missing values
    cleaned_df = df.dropna()
    
    # Remove any duplicate rows
    cleaned_df = cleaned_df.drop_duplicates()
    
    # Convert categorical variables into dummy/indicator variables
    categorical_cols = cleaned_df.select_dtypes(include=['object']).columns
    cleaned_df = pd.get_dummies(cleaned_df, columns=categorical_cols, drop_first=True)
    
    return cleaned_df

+ Feature Engineering Function

In [61]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def engineer_features(df, n_components):
   
    # Standardize features by removing the mean and scaling to unit variance
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(df)
    
    # Apply PCA to reduce dimensionality
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(scaled_features)
    
    # Convert the result back to a DataFrame
    engineered_df = pd.DataFrame(principal_components, columns=[f'PC{i+1}' for i in range(n_components)])
    
    return engineered_df

+ Model Training Function

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

def train_model(X, y):
    
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train a logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    
    # Predict on the test set and evaluate the model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(classification_report(y_test, y_pred))
    
    return model, accuracy

In [63]:
training_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,108,1,22,1,0,523,7.25,2
1,2,1,1,190,0,38,1,0,596,71.2833,0
2,3,1,3,353,0,26,0,0,669,7.925,2
3,4,1,1,272,0,35,1,0,49,53.1,2
4,5,0,3,15,1,35,0,0,472,8.05,2


In [64]:

# Clean the data
cleaned_df = clean_data(training_data)

# Engineer features
engineered_df = engineer_features(cleaned_df.drop('Survived', axis=1), n_components=2)

# Train the model
model, accuracy = train_model(engineered_df, cleaned_df['Survived'])

print(f"Model Accuracy: {accuracy}")

              precision    recall  f1-score   support

           0       0.72      0.85      0.78       105
           1       0.71      0.54      0.62        74

    accuracy                           0.72       179
   macro avg       0.72      0.69      0.70       179
weighted avg       0.72      0.72      0.71       179

Model Accuracy: 0.7206703910614525


#### **2. List Comprehensions and Lambda Functions:**

+ #### **Use list comprehensions and lambda functions to simplify data manipulation tasks**

+ Filtering Data

In [65]:
numbers = [5, 15, 3, 20, 8, 18]

# Using a for loop
filtered_numbers_loop = []
for num in numbers:
    if num >= 10:
        filtered_numbers_loop.append(num)

# Using list comprehension
filtered_numbers_comp = [num for num in numbers if num >= 10]

print(filtered_numbers_loop)  
print(filtered_numbers_comp)  

[15, 20, 18]
[15, 20, 18]


+ Transforming Data

In [66]:
numbers = [1, 2, 3, 4, 5]

# Using a for loop
squared_numbers_loop = []
for num in numbers:
    squared_numbers_loop.append(num ** 2)

# Using list comprehension
squared_numbers_comp = [num ** 2 for num in numbers]

# Using map with a lambda function
squared_numbers_lambda = list(map(lambda x: x ** 2, numbers))

print(squared_numbers_loop)    
print(squared_numbers_comp)    
print(squared_numbers_lambda)  

[1, 4, 9, 16, 25]
[1, 4, 9, 16, 25]
[1, 4, 9, 16, 25]


+ Sorting Data

In [67]:
people = [
    {'name': 'Alice', 'age': 25},
    {'name': 'Bob', 'age': 30},
    {'name': 'Charlie', 'age': 20}
]

# Using sorted with a lambda function
sorted_people = sorted(people, key=lambda x: x['age'])

print(sorted_people)

[{'name': 'Charlie', 'age': 20}, {'name': 'Alice', 'age': 25}, {'name': 'Bob', 'age': 30}]


+ Grouping Data

In [68]:
items = [
    {'name': 'Apple', 'category': 'Fruit'},
    {'name': 'Carrot', 'category': 'Vegetable'},
    {'name': 'Banana', 'category': 'Fruit'},
    {'name': 'Broccoli', 'category': 'Vegetable'}
]

# Using a dictionary comprehension
grouped_items = {category: [item for item in items if item['category'] == category] for category in set(item['category'] for item in items)}

print(grouped_items)

{'Fruit': [{'name': 'Apple', 'category': 'Fruit'}, {'name': 'Banana', 'category': 'Fruit'}], 'Vegetable': [{'name': 'Carrot', 'category': 'Vegetable'}, {'name': 'Broccoli', 'category': 'Vegetable'}]}


+ Example Use Case: Data Analysis

In [69]:
scores = [85, 90, 78, 92, 88, 76, 95, 89]

# Filter out scores less than 80
filtered_scores = [score for score in scores if score >= 80]

# Calculate the average score
average_score = sum(filtered_scores) / len(filtered_scores)

# Sort the scores in descending order
sorted_scores = sorted(filtered_scores, reverse=True)

# Group scores by grade (A: 90-100, B: 80-89)
grouped_scores = {'A': [score for score in filtered_scores if score >= 90], 'B': [score for score in filtered_scores if score < 90]}

print(f"Average Score: {average_score}")
print(f"Sorted Scores: {sorted_scores}")
print(f"Grouped Scores: {grouped_scores}")

Average Score: 89.83333333333333
Sorted Scores: [95, 92, 90, 89, 88, 85]
Grouped Scores: {'A': [90, 92, 95], 'B': [85, 88, 89]}


#### **3. Error Handling:**

+ #### **Implement error handling to make your code robust**

+ Example: Data Analysis with Error Handling

In [70]:
def analyze_scores(scores):
    try:
        # Check if scores is a list
        if not isinstance(scores, list):
            raise ValueError("Scores must be a list")

        # Check if scores is empty
        if len(scores) == 0:
            raise ValueError("Scores cannot be empty")

        # Calculate the average score
        average_score = sum(scores) / len(scores)

        # Filter out scores less than 80
        filtered_scores = [score for score in scores if score >= 80]

        # Sort the scores in descending order
        sorted_scores = sorted(filtered_scores, reverse=True)

        # Group scores by grade (A: 90-100, B: 80-89)
        grouped_scores = {'A': [score for score in filtered_scores if score >= 90], 'B': [score for score in filtered_scores if score < 90]}

        return average_score, sorted_scores, grouped_scores

    except ValueError as ve:
        print(f"Error: {ve}")
    except ZeroDivisionError:
        print("Error: Cannot calculate average score for an empty list")
    except Exception as e:
        print(f"Error: {e}")

# Example usage
scores = [85, 90, 78, 92, 88, 76, 95, 89]
average_score, sorted_scores, grouped_scores = analyze_scores(scores)

print(f"Average Score: {average_score}")
print(f"Sorted Scores: {sorted_scores}")
print(f"Grouped Scores: {grouped_scores}")

Average Score: 86.625
Sorted Scores: [95, 92, 90, 89, 88, 85]
Grouped Scores: {'A': [90, 92, 95], 'B': [85, 88, 89]}


### **Task 2: Intermediate Python II**

#### **1. Advanced Data Structures:**

+ #### **Use sets and dictionaries to optimize your code**

+ Example 1: Removing Duplicates from a List

In [71]:
numbers = [1, 2, 2, 3, 4, 4, 5, 6, 6]

# Using a list comprehension
unique_numbers_list = []
for num in numbers:
    if num not in unique_numbers_list:
        unique_numbers_list.append(num)

print(unique_numbers_list)  
# Using a set
unique_numbers_set = set(numbers)
print(unique_numbers_set)  

[1, 2, 3, 4, 5, 6]
{1, 2, 3, 4, 5, 6}


+ Example 2: Counting Frequencies

In [72]:
words = ['apple', 'banana', 'apple', 'orange', 'banana', 'banana']

# Using a dictionary
word_freq = {}
for word in words:
    if word in word_freq:
        word_freq[word] += 1
    else:
        word_freq[word] = 1

print(word_freq)  

# Using a dictionary with defaultdict
from collections import defaultdict
word_freq_defaultdict = defaultdict(int)
for word in words:
    word_freq_defaultdict[word] += 1

print(word_freq_defaultdict)  

{'apple': 2, 'banana': 3, 'orange': 1}
defaultdict(<class 'int'>, {'apple': 2, 'banana': 3, 'orange': 1})


+ Example 3: Finding Common Elements

In [73]:
numbers1 = [1, 2, 3, 4, 5]
numbers2 = [4, 5, 6, 7, 8]

# Using a list comprehension
common_numbers_list = [num for num in numbers1 if num in numbers2]
print(common_numbers_list)  

# Using sets
common_numbers_set = set(numbers1) & set(numbers2)
print(common_numbers_set)  

[4, 5]
{4, 5}


+ Example 4: Grouping Data

In [74]:
people = [
    {'name': 'Alice', 'age': 25},
    {'name': 'Bob', 'age': 30},
    {'name': 'Charlie', 'age': 25},
    {'name': 'David', 'age': 30}
]

# Using a dictionary
people_by_age = {}
for person in people:
    if person['age'] in people_by_age:
        people_by_age[person['age']].append(person)
    else:
        people_by_age[person['age']] = [person]

print(people_by_age)
# Using a defaultdict
from collections import defaultdict
people_by_age_defaultdict = defaultdict(list)
for person in people:
    people_by_age_defaultdict[person['age']].append(person)

print(people_by_age_defaultdict)

{25: [{'name': 'Alice', 'age': 25}, {'name': 'Charlie', 'age': 25}], 30: [{'name': 'Bob', 'age': 30}, {'name': 'David', 'age': 30}]}
defaultdict(<class 'list'>, {25: [{'name': 'Alice', 'age': 25}, {'name': 'Charlie', 'age': 25}], 30: [{'name': 'Bob', 'age': 30}, {'name': 'David', 'age': 30}]})


#### **2. Decorators and Context Managers:**

+ #### **Implement decorators and context managers to manage resources efficiently**

+ Example 1: Timer Decorator

In [75]:
import time
from functools import wraps

def timer_decorator(func):
    @wraps(func)
    def wrapper_timer(*args, **kwargs):
        start_time = time.time()
        value = func(*args, **kwargs)
        end_time = time.time()
        run_time = end_time - start_time
        print(f"Finished {func.__name__!r} in {run_time:.4f} secs")
        return value
    return wrapper_timer

@timer_decorator
def example_func():
    for i in range(10000000):
        pass

example_func()

Finished 'example_func' in 0.2944 secs


+ Example 2: Context Manager for File Handling

In [76]:
class FileManager:
    def __init__(self, filename, mode):
        self.filename = filename
        self.mode = mode
        self.file = None

    def __enter__(self):
        self.file = open(self.filename, self.mode)
        return self.file

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.file.close()

with FileManager('example.txt', 'w') as file:
    file.write('Hello, World!')

+ Example 3: Lock Decorator for Thread Safety

In [77]:
import threading
from functools import wraps

lock = threading.Lock()

def thread_safe_decorator(func):
    @wraps(func)
    def wrapper_thread_safe(*args, **kwargs):
        with lock:
            return func(*args, **kwargs)
    return wrapper_thread_safe

@thread_safe_decorator
def example_func():
    # Thread-safe code here
    pass

example_func()

#### **3. Multithreading/Multiprocessing:**

+ #### **Apply multithreading or multiprocessing to speed up data processing tasks**

In [78]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time
import threading
import multiprocessing

In [79]:
def cal_square(numbers):
    print("Calculate square numbers")
    for n in numbers:
        time.sleep(0.2)
        print("square", n*n)

def cal_cube(numbers):
    print("Calculate cube numbers")
    for n in numbers:
        time.sleep(0.2)
        print("cube", n*n*n)

arr= [1,2,3,4,5,6]
t= time.time()
cal_square(arr)
cal_cube(arr)

print("done in :",time.time()-t)
print("Haa! I'm done with all my work now.")

Calculate square numbers
square 1
square 4
square 9
square 16
square 25
square 36
Calculate cube numbers
cube 1
cube 8
cube 27
cube 64
cube 125
cube 216
done in : 2.416012763977051
Haa! I'm done with all my work now.


In [80]:
def cal_square(numbers):
    print("Calculate square numbers")
    for n in numbers:
        time.sleep(0.2)
        print("square", n*n)

def cal_cube(numbers):
    print("Calculate cube numbers")
    for n in numbers:
        time.sleep(0.2)
        print("cube", n*n*n)

arr= [1,2,3,4,5,6]
t= time.time()

t1 = threading.Thread(target=cal_square, args=(arr,))
t2 = threading.Thread(target=cal_cube, args=(arr,))

t1.start()
t2.start()

t1.join()
t2.join()

print("done in :",time.time()-t)
print("Haa! I'm done with all my work now.")

Calculate square numbers
Calculate cube numbers
square 1
cube 1
square 4
cube 8
square 9
cube 27
square 16
cube 64
cube 125
square 25
cube 216
square 36
done in : 1.2141952514648438
Haa! I'm done with all my work now.


### **Task 3: Python OOP**

#### **1. Pipeline Class:**

+ #### **Design a Python class to encapsulate the data wrangling and machine learning pipeline**

In [81]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

In [82]:
training_data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,108,1,22,1,0,523,7.25,2
1,2,1,1,190,0,38,1,0,596,71.2833,0


In [83]:
training_data.to_csv('output.csv', index=False)

In [84]:

class DataMLPipeline:
    def __init__(self, data_path, target_column):
        self.data_path = data_path
        self.target_column = target_column
        self.data = None
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
        self.pipeline = None
    
    def load_data(self):
        self.data = pd.read_csv(self.data_path)
        print("Data loaded successfully.")
    
    def preprocess_data(self):
        X = self.data.drop(self.target_column, axis=1)
        y = self.data[self.target_column]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        print("Data preprocessed successfully.")
    
    def build_pipeline(self):
        self.pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', LogisticRegression())
        ])
        print("Pipeline created successfully.")
    def train_model(self):
        self.pipeline.fit(self.X_train, self.y_train)
        print("Model trained successfully.")
    
    def evaluate_model(self):
        score = self.pipeline.score(self.X_test, self.y_test)
        print(f"Model accuracy: {score:.2f}")


In [85]:
pipeline = DataMLPipeline('output.csv', 'Survived') 
pipeline.load_data()
pipeline.preprocess_data()
pipeline.build_pipeline()
pipeline.train_model()
pipeline.evaluate_model()


Data loaded successfully.
Data preprocessed successfully.
Pipeline created successfully.
Model trained successfully.
Model accuracy: 0.82


#### **2. Class Methods:**

+ #### **Implement methods within the class for each step of the pipeline (e.g., data cleaning, feature engineering, model training)**

In [86]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [87]:
class DataMLPipeline:
    def __init__(self, data_path, target_column):
        self.data_path = data_path
        self.target_column = target_column
        self.data = None
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
        self.pipeline = None
    
    def load_data(self):
        self.data = pd.read_csv(self.data_path)
        print("Data loaded successfully.")
    
    def clean_data(self):
        # Example cleaning step: Removing rows with missing target values
        self.data.dropna(subset=[self.target_column], inplace=True)
        print("Data cleaned successfully.")
    
    def preprocess_data(self):
        X = self.data.drop(self.target_column, axis=1)
        y = self.data[self.target_column]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        print("Data preprocessed successfully.")
    
    def build_pipeline(self):
        class CustomFeatureEngineer(BaseEstimator, TransformerMixin):
            def fit(self, X, y=None):
                return self

            def transform(self, X):
                # Example feature engineering step: Adding a new feature
                X['new_feature'] = X.iloc[:, 0] * 2
                return X

        self.pipeline = Pipeline([
            ('feature_engineer', CustomFeatureEngineer()),
            ('scaler', StandardScaler()),
            ('classifier', LogisticRegression())
        ])
        print("Pipeline created successfully.")
    
    def train_model(self):
        self.pipeline.fit(self.X_train, self.y_train)
        print("Model trained successfully.")
    
    def evaluate_model(self):
        score = self.pipeline.score(self.X_test, self.y_test)
        print(f"Model accuracy: {score:.2f}")


In [88]:
pipeline = DataMLPipeline('output.csv', 'Age')
pipeline.load_data()
pipeline.clean_data()
pipeline.preprocess_data()
pipeline.build_pipeline()
pipeline.train_model()
pipeline.evaluate_model()

Data loaded successfully.
Data cleaned successfully.
Data preprocessed successfully.
Pipeline created successfully.
Model trained successfully.
Model accuracy: 0.21


#### **3. Reusability and Extendability:**

+ #### **Ensure your class is reusable and extendable for different datasets and machine learning tasks**

In [89]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

In [90]:
class DataPipeline:
    def __init__(self, numeric_features, categorical_features):
        self.numeric_features = numeric_features
        self.categorical_features = categorical_features
        self.pipeline = None

    def build_pipeline(self):
        numeric_transformer = Pipeline(steps=[
            ('scaler', StandardScaler())])

        categorical_transformer = Pipeline(steps=[
            ('onehot', OneHotEncoder(handle_unknown='ignore'))])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, self.numeric_features),
                ('cat', categorical_transformer, self.categorical_features)])

        self.pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', LogisticRegression())])

    def fit(self, X_train, y_train):
        self.pipeline.fit(X_train, y_train)

    def predict(self, X_test):
        return self.pipeline.predict(X_test)
