<a href="https://colab.research.google.com/github/Hafsa620/PAI-LAB/blob/main/Sampling_techniques(LAB_10).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Random Undersampling

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler

# Read the Titanic passenger table (path is relative to the working directory)
TITANIC_CSV = 'Titanic.csv'
data = pd.read_csv(TITANIC_CSV)


def impute_embarked(X):
    """Fill missing ``Embarked`` values with the column's most frequent value.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing an ``Embarked`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the gaps in ``Embarked`` filled. The input frame is
        not modified, so callers holding a reference to it (or to the frame it
        was sliced from) never see their data change.
    """
    modes = X['Embarked'].mode()
    if modes.empty:
        # Every value is missing, so there is nothing to fill with.
        return X.copy()
    return X.assign(Embarked=X['Embarked'].fillna(modes.iloc[0]))

# Custom function to create the FamilySize feature
def create_family_size(X):
    """Return a copy of ``X`` with a ``FamilySize`` column added.

    ``FamilySize`` is ``SibSp + Parch + 1``; the extra 1 counts the passenger
    themselves. The input frame is left unmodified.
    """
    return X.assign(FamilySize=X['SibSp'] + X['Parch'] + 1)

# Remove the raw sibling/spouse and parent/child count columns
def drop_columns(X):
    """Return ``X`` without its ``SibSp`` and ``Parch`` columns."""
    raw_family_columns = ['SibSp', 'Parch']
    return X.drop(columns=raw_family_columns)

# Build FamilySize, then discard the SibSp and Parch columns it was derived from
def family_size(X):
    """Apply ``create_family_size`` followed by ``drop_columns`` and return the result."""
    with_family_size = create_family_size(X)
    return drop_columns(with_family_size)

# Age: fill missing values with the mean learned during fit, then min-max scale
age_pipeline = Pipeline(steps=[
    ('age_imputer', SimpleImputer(strategy='mean')),
    ('age_scaler', MinMaxScaler()),
])

# Fare: min-max scaling only; no imputation step is applied to this column
fare_pipeline = Pipeline(steps=[
    ('fare_scaler', MinMaxScaler()),
])

# FamilySize: derive the feature from SibSp and Parch, then min-max scale it
family_size_pipeline = Pipeline(steps=[
    ('family_size_creator', FunctionTransformer(family_size)),
    ('family_size_scaler', MinMaxScaler()),
])

# Embarked: fill gaps with the most frequent value learned during fit, then
# one-hot encode. A fitted SimpleImputer replaces the stateless
# FunctionTransformer so that prediction data is filled with the training mode
# instead of the mode of whatever batch is being transformed (which also fails
# when that batch has no non-missing Embarked value, e.g. a single row).
embarked_pipeline = Pipeline(steps=[
    ('embarked_imputer', SimpleImputer(strategy='most_frequent')),
    ('embarked_onehot', OneHotEncoder()),
])

# Route each column to its transformer; columns not listed are passed through
preprocessor = ColumnTransformer(transformers=[
    ('drop', 'drop', ['PassengerId', 'Name', 'Ticket', 'Cabin']),
    ('age_encoder', age_pipeline, ['Age']),
    ('fare_encoder', fare_pipeline, ['Fare']),
    ('family_size', family_size_pipeline, ['SibSp', 'Parch']),
    ('embarked_encoder', embarked_pipeline, ['Embarked']),
    ('sex_encoder', OneHotEncoder(), ['Sex']),
    ('scaler', MinMaxScaler(), ['Pclass']),
], remainder='passthrough')

# Full model: preprocessing followed by a 5-nearest-neighbours classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=5)),
])

# Target column and feature frame
y = data['Survived']
X = data.drop(columns=['Survived'])

# Hold out 20% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random undersampling is applied to the training split only; the test split
# is left as produced by train_test_split
rus = RandomUnderSampler(random_state=42)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)

# Train on the resampled data, then predict the untouched test split
pipeline.fit(X_train_res, y_train_res)
y_pred = pipeline.predict(X_test)

# Report accuracy and the confusion matrix on the test split
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.2f}")

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))



Model Accuracy: 0.79
Confusion Matrix:
[[83 22]
 [16 58]]


Random Oversampling

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

# Read the Titanic passenger table (path is relative to the working directory)
TITANIC_CSV = 'Titanic.csv'
data = pd.read_csv(TITANIC_CSV)


def impute_embarked(X):
    """Fill missing ``Embarked`` values with the column's most frequent value.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing an ``Embarked`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the gaps in ``Embarked`` filled; the input frame is
        not modified.
    """
    modes = X['Embarked'].mode()
    if modes.empty:
        # Every value is missing, so there is nothing to fill with.
        return X.copy()
    return X.assign(Embarked=X['Embarked'].fillna(modes.iloc[0]))

# Custom function to create the FamilySize feature
def create_family_size(X):
    """Return a copy of ``X`` with a ``FamilySize`` column added.

    ``FamilySize`` is ``SibSp + Parch + 1``; the extra 1 counts the passenger
    themselves. The input frame is left unmodified.
    """
    return X.assign(FamilySize=X['SibSp'] + X['Parch'] + 1)

# Remove the raw sibling/spouse and parent/child count columns
def drop_columns(X):
    """Return ``X`` without its ``SibSp`` and ``Parch`` columns."""
    raw_family_columns = ['SibSp', 'Parch']
    return X.drop(columns=raw_family_columns)

# Build FamilySize, then discard the SibSp and Parch columns it was derived from
def family_size(X):
    """Apply ``create_family_size`` followed by ``drop_columns`` and return the result."""
    return drop_columns(create_family_size(X))

# Age: fill missing values with the mean learned during fit, then min-max scale
age_pipeline = Pipeline(steps=[
    ('age_imputer', SimpleImputer(strategy='mean')),
    ('age_scaler', MinMaxScaler()),
])

# Fare: min-max scaling only; no imputation step is applied to this column
fare_pipeline = Pipeline(steps=[
    ('fare_scaler', MinMaxScaler()),
])

# FamilySize: derive the feature from SibSp and Parch, then min-max scale it
family_size_pipeline = Pipeline(steps=[
    ('family_size_creator', FunctionTransformer(family_size)),
    ('family_size_scaler', MinMaxScaler()),
])

# Embarked: fill gaps with the most frequent value learned during fit, then
# one-hot encode. A fitted SimpleImputer replaces the stateless
# FunctionTransformer so that prediction data is filled with the training mode
# instead of the mode of whatever batch is being transformed (which also fails
# when that batch has no non-missing Embarked value, e.g. a single row).
embarked_pipeline = Pipeline(steps=[
    ('embarked_imputer', SimpleImputer(strategy='most_frequent')),
    ('embarked_onehot', OneHotEncoder()),
])

# Route each column to its transformer; columns not listed are passed through
preprocessor = ColumnTransformer(transformers=[
    ('drop', 'drop', ['PassengerId', 'Name', 'Ticket', 'Cabin']),
    ('age_encoder', age_pipeline, ['Age']),
    ('fare_encoder', fare_pipeline, ['Fare']),
    ('family_size', family_size_pipeline, ['SibSp', 'Parch']),
    ('embarked_encoder', embarked_pipeline, ['Embarked']),
    ('sex_encoder', OneHotEncoder(), ['Sex']),
    ('scaler', MinMaxScaler(), ['Pclass']),
], remainder='passthrough')

# Full model: preprocessing followed by a 5-nearest-neighbours classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=5)),
])

# Separate the features from the target column
X = data.drop('Survived', axis=1)
y = data['Survived']

# Hold out 20% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random oversampling, applied to the training split only. The sampler is
# named `ros` (random over-sampler) rather than `rus`, which would suggest an
# undersampler.
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

# Fit the pipeline on the resampled training data
pipeline.fit(X_train_res, y_train_res)

# Make predictions on the untouched test set
y_pred = pipeline.predict(X_test)

# Evaluate the model: accuracy first, then the confusion matrix
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.2f}")

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))



Model Accuracy: 0.78
Confusion Matrix:
[[80 25]
 [14 60]]


SMOTE Oversampling

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline # Import Pipeline from imblearn

# Read the Titanic passenger table (path is relative to the working directory)
TITANIC_CSV = 'Titanic.csv'
data = pd.read_csv(TITANIC_CSV)


def impute_embarked(X):
    """Fill missing ``Embarked`` values with the column's most frequent value.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing an ``Embarked`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the gaps in ``Embarked`` filled; the input frame is
        not modified.
    """
    modes = X['Embarked'].mode()
    if modes.empty:
        # Every value is missing, so there is nothing to fill with.
        return X.copy()
    return X.assign(Embarked=X['Embarked'].fillna(modes.iloc[0]))

# Custom function to create the FamilySize feature
def create_family_size(X):
    """Return a copy of ``X`` with a ``FamilySize`` column added.

    ``FamilySize`` is ``SibSp + Parch + 1``; the extra 1 counts the passenger
    themselves. The input frame is left unmodified.
    """
    return X.assign(FamilySize=X['SibSp'] + X['Parch'] + 1)

# Remove the raw sibling/spouse and parent/child count columns
def drop_columns(X):
    """Return ``X`` without its ``SibSp`` and ``Parch`` columns."""
    raw_family_columns = ['SibSp', 'Parch']
    return X.drop(columns=raw_family_columns)

# Build FamilySize, then discard the SibSp and Parch columns it was derived from
def family_size(X):
    """Apply ``create_family_size`` followed by ``drop_columns`` and return the result."""
    enriched = create_family_size(X)
    return drop_columns(enriched)

# Age: fill missing values with the mean learned during fit, then min-max scale
age_pipeline = Pipeline(steps=[
    ('age_imputer', SimpleImputer(strategy='mean')),
    ('age_scaler', MinMaxScaler()),
])

# Fare: min-max scaling only; no imputation step is applied to this column
fare_pipeline = Pipeline(steps=[
    ('fare_scaler', MinMaxScaler()),
])

# FamilySize: derive the feature from SibSp and Parch, then min-max scale it
family_size_pipeline = Pipeline(steps=[
    ('family_size_creator', FunctionTransformer(family_size)),
    ('family_size_scaler', MinMaxScaler()),
])

# Embarked: fill gaps with the most frequent value learned during fit, then
# one-hot encode. A fitted SimpleImputer replaces the stateless
# FunctionTransformer so that prediction data is filled with the training mode
# instead of the mode of whatever batch is being transformed (which also fails
# when that batch has no non-missing Embarked value, e.g. a single row).
embarked_pipeline = Pipeline(steps=[
    ('embarked_imputer', SimpleImputer(strategy='most_frequent')),
    ('embarked_onehot', OneHotEncoder()),
])

# Route each column to its transformer; columns not listed are passed through
preprocessor = ColumnTransformer(transformers=[
    ('drop', 'drop', ['PassengerId', 'Name', 'Ticket', 'Cabin']),
    ('age_encoder', age_pipeline, ['Age']),
    ('fare_encoder', fare_pipeline, ['Fare']),
    ('family_size', family_size_pipeline, ['SibSp', 'Parch']),
    ('embarked_encoder', embarked_pipeline, ['Embarked']),
    ('sex_encoder', OneHotEncoder(), ['Sex']),
    ('scaler', MinMaxScaler(), ['Pclass']),
], remainder='passthrough')

# Full model: preprocessing, SMOTE resampling, then a 5-nearest-neighbours
# classifier. A sampler as an intermediate step needs imblearn's Pipeline,
# which this cell imports under the name `Pipeline`.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', KNeighborsClassifier(n_neighbors=5)),
])

# Target column and feature frame
y = data['Survived']
X = data.drop(columns=['Survived'])

# Hold out 20% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# No manual resampling here: the pipeline carries its own 'smote' step
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Report accuracy and the confusion matrix on the test split
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.2f}")

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))



Model Accuracy: 0.79
Confusion Matrix:
[[82 23]
 [14 60]]


Heart Disease: SMOTE Oversampling

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
# Read the heart-disease table (path is relative to the working directory)
HEART_CSV = 'heart_2020_cleaned.csv'
data = pd.read_csv(HEART_CSV)

from sklearn.preprocessing import LabelEncoder,OrdinalEncoder
from sklearn.metrics import confusion_matrix
from sklearn.impute import SimpleImputer # Import SimpleImputer for missing values

# Encoder for the target column
label_encoder = LabelEncoder()

# AgeCategory and GenHealth: map categories to integers, then min-max scale.
# NOTE(review): OrdinalEncoder() with default settings orders categories by
# sorted value, not by meaning - confirm that the resulting order is the
# intended one for both columns, or pass `categories=` explicitly.
age_pipeline = Pipeline(steps=[
    ('age_encoder', OrdinalEncoder()),
    ('scaled_age', MinMaxScaler()),
])

genhealth_pipeline = Pipeline(steps=[
    ('health_encoder', OrdinalEncoder()),
    ('scaled_health', MinMaxScaler()),
])

# Column groups handled by the ColumnTransformer below
dropped_columns = ['Race', 'Asthma', 'SkinCancer']
scaled_columns = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']
one_hot_columns = [
    'Sex', 'Smoking', 'AlcoholDrinking', 'Stroke',
    'DiffWalking', 'Diabetic', 'PhysicalActivity', 'KidneyDisease',
]

# Route each column group to its transformer; unlisted columns pass through
preprocessing = ColumnTransformer(
    transformers=[
        ('drop', 'drop', dropped_columns),
        ('scaled_features', MinMaxScaler(), scaled_columns),
        ('ohe', OneHotEncoder(), one_hot_columns),
        ('age_encdoer', age_pipeline, ['AgeCategory']),
        ('health_encoder', genhealth_pipeline, ['GenHealth']),
    ],
    remainder='passthrough',
)

# Full model: preprocessing, SMOTE resampling, then a 5-nearest-neighbours
# classifier. No imputation step is included. `Pipeline` is not imported in
# this cell; with a sampler as a step it is expected to be imblearn's
# Pipeline, imported by an earlier cell.
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('smote', SMOTE(random_state=42)),
    ('model', KNeighborsClassifier(n_neighbors=5)),
])

# Separate the features from the target and encode the target labels as integers
X = data.drop('HeartDisease', axis=1)
y = label_encoder.fit_transform(data['HeartDisease'])

# Hold out 20% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit on the training split (the pipeline carries its own 'smote' step) and
# predict the test split
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Both metrics take (y_true, y_pred) in that order. Passing them the other way
# round swaps precision with recall and transposes the confusion matrix.
print(f"Classification_report:{classification_report(y_test, y_pred)}")
print(f"Confusion_matrix:{confusion_matrix(y_test, y_pred)}")

Classification_report:              precision    recall  f1-score   support

           0       0.80      0.95      0.87     49412
           1       0.55      0.21      0.31     14547

    accuracy                           0.78     63959
   macro avg       0.68      0.58      0.59     63959
weighted avg       0.75      0.78      0.74     63959

Confusion_matrix:[[46903  2509]
 [11464  3083]]
