1. Predicting Diabetes Onset Using Logistic Regression

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
diabetes_data=pd.read_csv("diabetes.csv")
diabetes_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
print("Missing Values:")
diabetes_data.isnull().sum()

Missing Values:


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [4]:
# Standardize features
scaler = StandardScaler()
X = diabetes_data.drop('Outcome', axis=1)
y = diabetes_data['Outcome']
X_scaled = scaler.fit_transform(X)

In [5]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Implement logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

In [6]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')

Accuracy: 0.75
Precision: 0.65
Recall: 0.67


2. Classifying Iris Species Using Decision Trees

In [7]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
iris_data=pd.read_csv("IRIS.csv")
iris_data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [8]:
print("Missing Values:")
iris_data.isnull().sum()

Missing Values:


sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [9]:
# Encode categorical target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [10]:
# Separate features and target variable
X = iris_data.drop('species', axis=1)
y = iris_data['species']
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [11]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Implement decision tree classifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

In [12]:
# Evaluate the model
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Accuracy: {accuracy:.2f}')

Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Accuracy: 1.00


3. Predicting Titanic Survival Using Logistic Regression

In [13]:
from sklearn.metrics import roc_auc_score
titanic_data=pd.read_csv("titanic .csv")
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [14]:
titanic_data.shape

(891, 12)

In [15]:
print("Missing values:")
titanic_data.isnull().sum()

Missing values:


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [16]:
# Drop non-numeric columns
titanic_data = titanic_data.drop(['Name', 'Ticket','Cabin'], axis=1)

In [17]:
# Handle missing values
titanic_data['Age'].fillna(titanic_data['Age'].median(), inplace=True)
# Drop rows with missing values in 'Embarked' column
titanic_data = titanic_data.dropna(subset=['Embarked'])

In [18]:
print("Missing values:")
titanic_data.isnull().sum()

Missing values:


PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [19]:
# Encode categorical variables
titanic_data = pd.get_dummies(titanic_data, columns=['Sex', 'Embarked'], drop_first=True)

In [20]:
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,True,False,True
1,2,1,1,38.0,1,0,71.2833,False,False,False
2,3,1,3,26.0,0,0,7.925,False,False,True
3,4,1,1,35.0,1,0,53.1,False,False,True
4,5,0,3,35.0,0,0,8.05,True,False,True


In [21]:
# Separate features and target variable
X = titanic_data.drop('Survived', axis=1)
y = titanic_data['Survived']

In [22]:
 # numerical features for standardization
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
X_numerical = X[numerical_features]

# Standardize numerical features
scaler = StandardScaler()
X_scaled = X.copy()
X_scaled[numerical_features] = scaler.fit_transform(X_numerical)

In [23]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Implement logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

In [24]:
# Predictions
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Evaluate the model
roc_auc = roc_auc_score(y_test, y_pred_prob)

print(f'ROC-AUC: {roc_auc:.2f}')

ROC-AUC: 0.85


4. Classifying Emails as Spam Using Decision Trees

In [25]:
from sklearn.metrics import precision_score, recall_score, f1_score
email_data=pd.read_csv("emails.csv")
email_data.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [26]:
print("Missing values:")
email_data.isnull().sum()

Missing values:


Email No.     0
the           0
to            0
ect           0
and           0
             ..
military      0
allowing      0
ff            0
dry           0
Prediction    0
Length: 3002, dtype: int64

In [27]:
# Convert email identifiers to numeric
email_mapping = {email: idx+1 for idx, email in enumerate(email_data['Email No.'].unique())}
email_data['Email No.'] = email_data['Email No.'].map(email_mapping)

In [28]:
# Standardize features
scaler = StandardScaler()
X = email_data.drop('Prediction', axis=1)
y = email_data['Prediction']
X_scaled = scaler.fit_transform(X)

In [29]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Implement decision tree classifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

In [30]:
# Evaluate the model
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-Score: {f1:.2f}')

Precision: 0.88
Recall: 0.87
F1-Score: 0.87


5. Predicting Customer Satisfaction Using Logistic Regression

In [31]:
customer_data=pd.read_csv("customer_data.csv")
customer_data.head()

Unnamed: 0,Customer ID,Overall Delivery Experience (Rating),Food Quality (Rating),Speed of Delivery (Rating),Order Accuracy
0,1,5.0,3.0,4.0,Yes
1,2,3.0,4.0,3.0,Yes
2,3,4.0,5.0,2.0,Yes
3,4,5.0,3.0,4.0,Yes
4,5,2.0,5.0,1.0,Yes


In [32]:
customer_data.shape

(10616, 5)

In [33]:
customer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10616 entries, 0 to 10615
Data columns (total 5 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Customer ID                           10616 non-null  int64  
 1   Overall Delivery Experience (Rating)  10198 non-null  float64
 2   Food Quality (Rating)                 10364 non-null  float64
 3   Speed of Delivery (Rating)            10377 non-null  float64
 4   Order Accuracy                        9956 non-null   object 
dtypes: float64(3), int64(1), object(1)
memory usage: 414.8+ KB


In [34]:
print("Missing values:")
customer_data.isnull().sum()

Missing values:


Customer ID                               0
Overall Delivery Experience (Rating)    418
Food Quality (Rating)                   252
Speed of Delivery (Rating)              239
Order Accuracy                          660
dtype: int64

In [35]:
# Fill missing values in numerical columns with the median
customer_data.fillna(customer_data.median(numeric_only=True), inplace=True)
# Fill missing values in the target variable  with the mode
order_accuracy_mode = customer_data['Order Accuracy'].mode()[0]
customer_data['Order Accuracy'].fillna(order_accuracy_mode, inplace=True)

In [36]:
print("Missing values:")
customer_data.isnull().sum()

Missing values:


Customer ID                             0
Overall Delivery Experience (Rating)    0
Food Quality (Rating)                   0
Speed of Delivery (Rating)              0
Order Accuracy                          0
dtype: int64

In [37]:
# Separate features and target variable
X = customer_data.drop('Order Accuracy', axis=1)  
y = customer_data['Order Accuracy']               

# Standardize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [38]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Implement logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

In [39]:
# Evaluate the model
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Accuracy: {accuracy:.2f}')

Confusion Matrix:
[[   0  566]
 [   5 1553]]
Accuracy: 0.73
