In [15]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [16]:
data = pd.read_csv('weight-height.csv')

data

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.042470
4,Male,69.881796,206.349801
...,...,...,...
9995,Female,66.172652,136.777454
9996,Female,67.067155,170.867906
9997,Female,63.867992,128.475319
9998,Female,69.034243,163.852461


In [17]:
from sklearn.preprocessing import LabelEncoder


le = LabelEncoder()
data['Gender'] = le.fit_transform(data['Gender'])


In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

y = data['Gender']
Xs = data.drop(columns=['Gender', 'Height'])  # Gender is the target variable

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30, 40, 50, 60],
    'min_samples_split': [10, 20, 30, 40, 50],
    'min_impurity_decrease': [0, 0.001, 0.005, 0.01, 0.05, 0.1]
}

gridSearch = GridSearchCV(
    DecisionTreeClassifier(),
    param_grid,
    cv=3, 
    scoring='recall',
    verbose=1
)

gridSearch.fit(Xs, y)
print('Best Score:', gridSearch.best_score_)
print('Initial parameters:', gridSearch.best_params_)


Fitting 3 folds for each of 360 candidates, totalling 1080 fits


Best Score: 0.887800111046218
Initial parameters: {'criterion': 'gini', 'max_depth': 40, 'min_impurity_decrease': 0, 'min_samples_split': 40}


2. With Random Sampling

In [20]:
data_rs = data.sample(1000, random_state=1) 

y = data['Gender']
Xs = data.drop(columns=['Gender', 'Height'])  # Gender is the target variable

gridSearch = GridSearchCV(
    DecisionTreeClassifier(),
    param_grid,
    cv=3, 
    scoring='recall',
    verbose=1
)

gridSearch.fit(Xs, y)
print('Initial Score:', gridSearch.best_score_)
print('Initial parameters:', gridSearch.best_params_)


Fitting 3 folds for each of 360 candidates, totalling 1080 fits
Initial Score: 0.887800111046218
Initial parameters: {'criterion': 'gini', 'max_depth': 40, 'min_impurity_decrease': 0, 'min_samples_split': 40}


3. Building a model - decision tree

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X = data.drop(columns=['Gender', 'Height']) 
y = data['Gender'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [23]:
best_params = {
    'criterion': 'gini',
    'max_depth': 40,
    'min_impurity_decrease': 0,
    'min_samples_split': 40
}

dt_classifier = DecisionTreeClassifier(**best_params)


dt_classifier.fit(X_train, y_train)

y_pred = dt_classifier.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Printing the classification report
print(classification_report(y_test, y_pred))

Accuracy: 0.8905
              precision    recall  f1-score   support

           0       0.88      0.89      0.89       988
           1       0.90      0.89      0.89      1012

    accuracy                           0.89      2000
   macro avg       0.89      0.89      0.89      2000
weighted avg       0.89      0.89      0.89      2000



In [1]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.datasets import load_iris
import pandas as pd

# Load an example dataset (replace this with your own dataset)
iris = load_iris()
X, y = iris.data, iris.target

# Convert the data to a DataFrame for demonstration purposes
columns = [f"Feature_{i}" for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=columns)
df['Target'] = y

# Separate features and target variable
X = df.drop('Target', axis=1)
y = df['Target']

# Initialize the RandomOverSampler
ros = RandomOverSampler(random_state=42)

# Fit and transform the data
X_resampled, y_resampled = ros.fit_resample(X, y)

# Check the number of samples in each class after oversampling
print("Original class distribution:", y.value_counts().to_dict())
print("Resampled class distribution:", pd.Series(y_resampled).value_counts().to_dict())

Original class distribution: {0: 50, 1: 50, 2: 50}
Resampled class distribution: {0: 50, 1: 50, 2: 50}
