DAY 1: Conquering predictive models for numerical data

Import the required libraries and load the dataset from `adult.csv`

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, classification_report, accuracy_score
import umap

# Read the raw table from adult.csv (path is relative to the notebook's working directory)
data = pd.read_csv('adult.csv')

# Quick preview of the top rows to confirm the file was parsed as expected
print("First few rows of the dataset:", data.head(), sep="\n")


Print summaries of the data to make sure that this is a legitimate dataset

In [None]:
# Preview: header line followed by the top rows of the frame
print("First few rows of the dataset:", data.head(), sep="\n")

# Structure: info() writes its own report (columns, non-null counts, dtypes) to stdout
print("\nDataset Information:")
data.info()

# Distribution: describe() summarises the numeric columns
print("\nSummary Statistics:", data.describe(), sep="\n")


In [None]:
# Count the null entries in every column.
# NOTE(review): only real nulls (NaN/None) are counted here; placeholder strings
# such as '?' would not show up -- confirm how this CSV marks missing entries.
missing_per_column = data.isnull().sum()
print("\nMissing Values:")
print(missing_per_column)

Fill in missing values (median for numerical columns, most frequent value for categorical columns), then one-hot encode the categorical columns so that each category value gets its own column

In [None]:
# Split the columns by dtype: text columns are treated as categorical,
# int64/float64 columns as numerical.
categorical_cols = data.select_dtypes(include=['object']).columns
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns

# Impute missing values: median for numerical columns, mode for categorical columns
imputer = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numerical_cols),
        ('cat', SimpleImputer(strategy='most_frequent'), categorical_cols)
    ])

# ColumnTransformer concatenates its outputs in transformer order (numeric
# block first, then categorical), so the column labels are rebuilt in that order.
data_imputed = pd.DataFrame(
    imputer.fit_transform(data),
    columns=numerical_cols.tolist() + categorical_cols.tolist(),
)

# Stacking numeric and string outputs produces a single object-dtype array, so
# every column comes back as `object`. Restore a real numeric dtype on the
# numerical columns so that correlations and model fitting treat them as numbers.
data_imputed = data_imputed.astype({col: 'float64' for col in numerical_cols})
print(data_imputed.columns)

# One-hot encode the categorical columns; drop='first' removes one level per
# column to avoid perfectly collinear indicator columns.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_cols = encoder.fit_transform(data_imputed[categorical_cols])
encoded_df = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names_out(categorical_cols))

# Combine numerical and encoded categorical data (both frames share the default
# 0..n-1 index, so the column-wise concat lines rows up one-to-one)
data_cleaned = pd.concat([data_imputed[numerical_cols], encoded_df], axis=1)


Make the heatmap; notice that the diagonal line is red because each column is being compared with itself there, which always gives a correlation of 1

In [None]:
# Pairwise correlations between all cleaned columns, drawn as a heatmap
print("\nCorrelation Heatmap:")
correlation_matrix = data_cleaned.corr()

# Explicit figure/axes pair so the heatmap is drawn on a known 12x8 canvas
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, cmap='coolwarm', ax=ax)
plt.show()


Set up age as the feature and the label-encoded marital status as the target, then split them into training and test sets

In [None]:
# Show which columns are available after cleaning
print(data_cleaned.columns)

# Turn the marital-status strings into integer class codes and keep them
# alongside the cleaned features
label_encoder = LabelEncoder()
marital_codes = label_encoder.fit_transform(data_imputed["marital.status"])
data_cleaned['marital_status_encoded'] = marital_codes

# Single-feature design matrix (age) and the encoded target
X = data_cleaned[["age"]]
y = data_cleaned['marital_status_encoded']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


Train a logistic regression model to predict marital status from age; the accuracy indicates how strongly the two are related

In [None]:
# Fit a logistic regression classifier on the training split; max_iter is
# raised to 1000 to give the solver more room to converge.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = model.predict(X_test)

# Evaluate the model
print('Accuracy:', accuracy_score(y_test, y_pred))

# LabelEncoder assigns codes 0..n-1 in the order of `classes_`. Passing the
# full code list keeps the report aligned with `target_names` even when a rare
# class is missing from the test split or is never predicted (without `labels`
# that mismatch raises). zero_division=0 reports 0 for classes with no
# predictions instead of emitting undefined-metric warnings.
all_class_codes = np.arange(len(label_encoder.classes_))
print('Classification Report:\n', classification_report(
    y_test,
    y_pred,
    labels=all_class_codes,
    target_names=label_encoder.classes_,
    zero_division=0,
))


Use the trained model to predict the marital status for a user-supplied age

In [None]:
def predict_marital_status(age):
    """Predict the marital status label for a given age.

    Parameters
    ----------
    age : int, float or str
        The age to predict for. Strings (e.g. the raw result of ``input()``)
        are converted to a number first.

    Returns
    -------
    The original (un-encoded) marital status label predicted by ``model``.

    Raises
    ------
    ValueError
        If ``age`` cannot be interpreted as a finite, non-negative number.
    """
    age_value = float(age)  # raises ValueError for non-numeric text
    if not np.isfinite(age_value) or age_value < 0:
        raise ValueError(f"age must be a finite, non-negative number, got {age!r}")

    # The model was fitted on a DataFrame with a single "age" column, so build
    # the query the same way to keep the feature names consistent.
    features = pd.DataFrame({"age": [age_value]})
    predicted_encoded = model.predict(features)
    # Convert prediction back to original label
    predicted_status = label_encoder.inverse_transform(predicted_encoded)
    return predicted_status[0]

# Example usage
age_input = input("What age's marital status are you interested in: ")
try:
    predicted_status = predict_marital_status(age_input)
except ValueError:
    # input() returns free text, so report bad entries instead of crashing the cell
    print(f"'{age_input}' is not a valid age; please enter a non-negative number.")
else:
    print(f'The predicted marital status for age {age_input} is {predicted_status}.')