In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import pickle

In [None]:
# Read the three raw ("noisy") CSV datasets into DataFrames.
penguins_df, diamonds_df, emissions_df = map(
    pd.read_csv,
    (
        "noisy_datasets/penguins.csv",
        "noisy_datasets/diamond.csv",
        "noisy_datasets/emissions_by_country.csv",
    ),
)

# Print summary statistics for each dataset, each under its own heading.
for heading, frame in (
    ("Penguin Data \n", penguins_df),
    ("\nDiamond Data \n", diamonds_df),
    ("\nEmissions Dataset\n", emissions_df),
):
    print(heading, frame.describe())

# Column dtypes and non-null counts for the emissions data.
emissions_df.info()

In [None]:
# Clean the penguin data:
#   1. drop rows missing any categorical identifier,
#   2. fill missing numeric values with the column mean,
#   3. lower-case the categorical labels so spelling variants collapse,
#   4. replace IQR outliers in numeric columns with the column median.
penguins_df.info()

categorical_columns = ['species', 'island', 'gender']
penguins_df = penguins_df.dropna(subset=categorical_columns)

# Only numeric columns take part in mean-imputation and outlier detection;
# quantiles and </> comparisons are not defined for the string columns.
numeric_columns = penguins_df.select_dtypes(include='number').columns
penguins_df = penguins_df.fillna(penguins_df[numeric_columns].mean())

# Column-wise .str.lower() replaces the deprecated element-wise applymap(str.lower).
penguins_df[categorical_columns] = penguins_df[categorical_columns].apply(
    lambda labels: labels.str.lower()
)

# Tukey fences: anything outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] counts as an outlier.
Q1 = penguins_df[numeric_columns].quantile(0.25)
Q3 = penguins_df[numeric_columns].quantile(0.75)
IQR = Q3 - Q1
outliers = (penguins_df[numeric_columns] < (Q1 - 1.5 * IQR)) | (
    penguins_df[numeric_columns] > (Q3 + 1.5 * IQR)
)

# Keep non-outlier values and substitute the column median for the outliers.
for column in numeric_columns:
    penguins_df[column] = penguins_df[column].where(
        ~outliers[column], penguins_df[column].median()
    )


In [None]:
# Bar chart: mean body mass per gender.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=penguins_df,
    x='gender',
    y='body_mass_g',
    estimator=np.mean,
    palette='muted',
    ax=ax,
)
ax.set_title('Average Body Mass Distribution by Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Average Body Mass (g)')
plt.show()

In [None]:
# Bar chart: mean bill length per species, with slanted tick labels.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=penguins_df,
    x='species',
    y='bill_length_mm',
    estimator=np.mean,
    palette='muted',
    ax=ax,
)
ax.set_title('Average Bill Length by Species')
ax.set_xlabel('Species')
ax.set_ylabel('Average Bill Length (mm)')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


In [None]:
# Box plot: spread of the 'calorie requirement' column within each species.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=penguins_df,
    x='species',
    y='calorie requirement',
    palette='muted',
    ax=ax,
)
ax.set_title('Calorie Requirement Distribution by Species')
ax.set_xlabel('Species')
ax.set_ylabel('Calorie Requirement')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


In [None]:
# Grouped count plot: number of penguins per island, split by gender.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    data=penguins_df,
    x='island',
    hue='gender',
    palette='muted',
    ax=ax,
)
ax.set_title('Gender Distribution by Island')
ax.set_xlabel('Island')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Pairwise Pearson correlation of the numeric columns, shown as an annotated heatmap.
# The string columns (species / island / gender) are excluded explicitly:
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
correlation_matrix = penguins_df.select_dtypes(include='number').corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# One-hot encode species/island, build a binary gender target, then drop every
# column whose absolute correlation with that target is below `threshold`.
# NOTE: get_dummies consumes the 'species' and 'island' columns, so re-running
# this cell requires re-running the cleaning cell first.
penguins_df = pd.get_dummies(penguins_df, columns=['species', 'island'], drop_first=False)

# Gender labels other than 'female' / 'male' map to NaN.
penguins_df['gender_target'] = penguins_df['gender'].map({'female': 0, 'male': 1})

# Correlate numeric and boolean (dummy) columns only; the string 'gender'
# column is still present and would make DataFrame.corr() raise on pandas >= 2.0.
correlation_matrix = penguins_df.select_dtypes(include=['number', 'bool']).corr()
target_correlation = correlation_matrix['gender_target']
print(target_correlation)

threshold = 0.1
low_correlation_features = target_correlation[target_correlation.abs() < threshold].index.tolist()
print(low_correlation_features)

corelated_df = penguins_df.drop(columns=low_correlation_features)
corelated_df.head(5)

In [22]:
# Min-max scale the continuous features into [0, 1].
# Only columns still present are scaled: the correlation filter may have
# dropped some of them, and indexing a missing column would raise KeyError.
non_categorical_columns = [
    column
    for column in ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
    if column in corelated_df.columns
]
min_values = corelated_df[non_categorical_columns].min()
max_values = corelated_df[non_categorical_columns].max()

# A constant column has max == min; dividing by 1 instead of 0 maps it to 0.0
# rather than producing NaN.
value_range = (max_values - min_values).replace(0, 1)
corelated_df[non_categorical_columns] = (
    corelated_df[non_categorical_columns] - min_values
) / value_range


In [None]:
# Print column names, dtypes and non-null counts of corelated_df as a sanity check before modelling.
corelated_df.info()

In [None]:
class Logit_Regression():
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    learning_rate : float
        Step size applied to every gradient update.
    iterations_count : int
        Number of gradient-descent iterations performed by ``fit``.
    weights : float or array-like
        Initial weights. A scalar is broadcast to one weight per feature; an
        array must contain exactly one value per feature (any shape).

    Attributes
    ----------
    weights : numpy.ndarray
        1-D weight vector (one entry per feature) once data has been seen.
    bias : float or None
        Intercept; initialised to 1.0 on first use and learned during ``fit``.
    loss : list of float
        Cross-entropy loss recorded at every training iteration.
    """

    def __init__(self, learning_rate, iterations_count, weights):
        self.learning_rate = learning_rate
        self.iterations_count = iterations_count
        self.weights = weights
        self.loss = []
        self.bias = None

    def _prepare(self, X):
        """Return X as a 2-D float array and normalise weights/bias to match it."""
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        n_features = X.shape[1]

        w = np.asarray(self.weights, dtype=float)
        if w.size == 1:
            # A scalar initial weight is shared by every feature.
            w = np.full(n_features, w.item())
        elif w.size == n_features:
            w = w.reshape(n_features)
        else:
            raise ValueError(
                f"weights has {w.size} values but X has {n_features} features"
            )
        self.weights = w

        # Initialise the bias once; it is a learned parameter afterwards.
        if self.bias is None:
            self.bias = 1.0
        return X

    def sigmoid(self, z):
        """Logistic function 1 / (1 + exp(-z)), applied element-wise."""
        # Clipping keeps exp() inside the float64 range for extreme inputs.
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def cost(self, y, y_hat):
        """Mean binary cross-entropy between labels ``y`` and probabilities ``y_hat``."""
        y = np.asarray(y, dtype=float).ravel()
        # Keep probabilities strictly inside (0, 1) so log() never sees 0.
        eps = 1e-15
        y_hat = np.clip(np.asarray(y_hat, dtype=float).ravel(), eps, 1 - eps)
        return float(np.mean(-y * np.log(y_hat) - (1 - y) * np.log(1 - y_hat)))

    def gradient_descent(self, X, y):
        """Perform one gradient step.

        The bias is updated in place on ``self.bias``; the updated weights are
        returned (the caller assigns them), together with the probabilities
        that were computed *before* the update.

        Returns
        -------
        (numpy.ndarray, numpy.ndarray)
            Updated 1-D weight vector and pre-update predicted probabilities.
        """
        X = self._prepare(X)
        y = np.asarray(y, dtype=float).ravel()
        N = len(y)
        if N == 0 or N != X.shape[0]:
            raise ValueError("X and y must be non-empty and have the same number of rows")

        y_hat = self.sigmoid(X @ self.weights + self.bias)
        delta = y_hat - y
        # Gradients of the mean cross-entropy w.r.t. weights and bias.
        dW = X.T @ delta / N
        db = delta.mean()

        w = self.weights - self.learning_rate * dW
        self.bias = self.bias - self.learning_rate * db
        return w, y_hat

    def fit(self, X, y, log_every=None):
        """Train the model and pickle the best parameters seen.

        The weights/bias with the highest training accuracy are written to
        ``best_weights_file.pkl`` as ``{'weights': ..., 'bias': ...}``.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
        y : array-like of 0/1 labels, shape (n_samples,) or (n_samples, 1)
        log_every : int, optional
            Print progress every ``log_every`` iterations (the last iteration
            is always printed). Defaults to roughly 20 progress lines per run
            so that long trainings do not flood the notebook output.
        """
        best_weights = None
        best_bias = None
        best_acc = 0
        if log_every is None:
            log_every = self.iterations_count // 20
        log_every = max(1, int(log_every))

        for i in range(self.iterations_count):
            self.weights, y_hat = self.gradient_descent(X, y)
            c = self.cost(y, y_hat)
            self.loss.append(c)
            acc = self.accuracy(y, self.predict(X))
            if best_weights is None or acc > best_acc:
                best_acc = acc
                # Copy so later updates cannot alias the stored best weights.
                best_weights = np.copy(self.weights)
                best_bias = self.bias

            if i % log_every == 0 or i == self.iterations_count - 1:
                print(f"Iteration {i}:\nLoss is {c}\nAccuracy is {acc}\n")

        updated_weights = {'weights': best_weights, 'bias': best_bias}
        with open('best_weights_file.pkl', 'wb') as f:
            pickle.dump(updated_weights, f)
        print("Best Accuracy during training: ", best_acc * 100)

    def predict(self, X):
        """Return the predicted probability of class 1 for each row of X (1-D array)."""
        X = self._prepare(X)
        return self.sigmoid(X @ self.weights + self.bias)

    def accuracy(self, y, y_hat):
        """Fraction of samples whose thresholded (0.5) prediction matches the label."""
        # Flatten both sides: comparing (N,) with (N, 1) would broadcast to N x N.
        y_hat_bin = (np.asarray(y_hat).ravel() > 0.5).astype(int)
        y_bin = (np.asarray(y).ravel() > 0.5).astype(int)
        if y_bin.size == 0 or y_bin.size != y_hat_bin.size:
            raise ValueError("y and y_hat must be non-empty and the same length")
        return float(np.mean(y_hat_bin == y_bin))

    def evaluate_accuracy(self, y, y_hat):
        """Alias of ``accuracy``, provided for callers that use this name."""
        return self.accuracy(y, y_hat)

    def plot_loss(self):
        """Plot the training loss recorded in ``self.loss`` against iteration."""
        import matplotlib.pyplot as plt

        plt.figure(figsize=(10, 6))
        plt.plot(self.loss)
        plt.title('Training Loss')
        plt.xlabel('Iteration')
        plt.ylabel('Loss')
        plt.show()




In [None]:
# Train/test split, model training, evaluation and inspection of saved weights.

# Use only the features that survived the correlation filter, and only rows
# with a known target: unmapped gender labels are NaN and would poison the loss.
feature_columns = [
    column
    for column in ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
    if column in corelated_df.columns
]
labelled_df = corelated_df.dropna(subset=['gender_target'])
X = labelled_df[feature_columns].values.astype(float)
y = labelled_df['gender_target'].values.reshape(-1, 1)

# Seeded 80/20 shuffle split so the results are reproducible across runs.
np.random.seed(42)
N = X.shape[0]
train_size = int(0.8 * N)
index_number = np.arange(N)
np.random.shuffle(index_number)
train_indices = index_number[:train_size]
test_indices = index_number[train_size:]
X_train = X[train_indices]
y_train = y[train_indices]
X_test = X[test_indices]
y_test = y[test_indices]
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# One initial weight per feature (a bare scalar has no transpose or dot product).
weights = np.random.uniform(0, 1, size=(X_train.shape[1], 1))
learning_rate = 0.001
iterations_count = 1000000
model = Logit_Regression(learning_rate, iterations_count, weights)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
test_accuracy = model.accuracy(y_test, y_pred)
print("Test Accuracy: ", test_accuracy * 100)

# Training-loss curve, taken from the losses recorded during fit().
plt.figure(figsize=(10, 6))
plt.plot(model.loss)
plt.title('Training Loss')
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.show()

# fit() writes its best parameters to 'best_weights_file.pkl'; read that same file.
# The pickle is produced locally by this notebook; do not load untrusted pickles.
with open('best_weights_file.pkl', 'rb') as f:
    saved_model = pickle.load(f)
print("Saved Weights: ", saved_model['weights'])
print("Saved Bias: ", saved_model['bias'])