In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import umap

# Load the dataset
data = pd.read_csv('adult.csv')

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())


Matplotlib is building the font cache; this may take a moment.


First few rows of the dataset:
   age workclass  fnlwgt     education  education.num marital.status  \
0   90         ?   77053       HS-grad              9        Widowed   
1   82   Private  132870       HS-grad              9        Widowed   
2   66         ?  186061  Some-college             10        Widowed   
3   54   Private  140359       7th-8th              4       Divorced   
4   41   Private  264663  Some-college             10      Separated   

          occupation   relationship   race     sex  capital.gain  \
0                  ?  Not-in-family  White  Female             0   
1    Exec-managerial  Not-in-family  White  Female             0   
2                  ?      Unmarried  Black  Female             0   
3  Machine-op-inspct      Unmarried  White  Female             0   
4     Prof-specialty      Own-child  White  Female             0   

   capital.loss  hours.per.week native.country income  
0          4356              40  United-States  <=50K  
1          4356

In [2]:
# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())

# Display dataset information
print("\nDataset Information:")
data.info()

# Display summary statistics
print("\nSummary Statistics:")
print(data.describe())


First few rows of the dataset:
   age workclass  fnlwgt     education  education.num marital.status  \
0   90         ?   77053       HS-grad              9        Widowed   
1   82   Private  132870       HS-grad              9        Widowed   
2   66         ?  186061  Some-college             10        Widowed   
3   54   Private  140359       7th-8th              4       Divorced   
4   41   Private  264663  Some-college             10      Separated   

          occupation   relationship   race     sex  capital.gain  \
0                  ?  Not-in-family  White  Female             0   
1    Exec-managerial  Not-in-family  White  Female             0   
2                  ?      Unmarried  Black  Female             0   
3  Machine-op-inspct      Unmarried  White  Female             0   
4     Prof-specialty      Own-child  White  Female             0   

   capital.loss  hours.per.week native.country income  
0          4356              40  United-States  <=50K  
1          4356

In [3]:
# Check for missing values
print("\nMissing Values:")
print(data.isnull().sum())


Missing Values:
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64


In [4]:
# Fill missing values with mode for categorical columns and median for numerical columns
categorical_cols = data.select_dtypes(include=['object']).columns
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns

# Impute missing values
imputer = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numerical_cols),
        ('cat', SimpleImputer(strategy='most_frequent'), categorical_cols)
    ])

data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=numerical_cols.tolist() + categorical_cols.tolist())

# Encode categorical variables
encoder = OneHotEncoder(drop='first', sparse=False)
encoded_cols = encoder.fit_transform(data_imputed[categorical_cols])
encoded_df = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names_out(categorical_cols))

# Combine numerical and encoded categorical data
data_cleaned = pd.concat([data_imputed[numerical_cols], encoded_df], axis=1)


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'

In [None]:
# Visualize data distributions
print("\nData Distributions:")
data_cleaned.hist(bins=30, figsize=(15, 10))
plt.tight_layout()
plt.show()

# Visualize correlations
print("\nCorrelation Heatmap:")
plt.figure(figsize=(12, 8))
sns.heatmap(data_cleaned.corr(), annot=True, cmap='coolwarm')
plt.show()


In [None]:
# Define features and target variable
# Assume 'target_column_name' is the column to be predicted. Replace with actual column name.
target_column_name = 'target_column_name'  # replace with actual target column
X = data_cleaned.drop(target_column_name, axis=1)
y = data_cleaned[target_column_name]


In [None]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Define a function to evaluate models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'Model Performance:\n MSE: {mse}\n R2: {r2}')

# Train Linear Regression Model
print("\nTraining Linear Regression Model...")
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
print('Linear Regression Model Performance:')
evaluate_model(lr_model, X_test, y_test)

In [None]:
# Train Decision Tree Model
print("\nTraining Decision Tree Model...")
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, y_train)
print('Decision Tree Model Performance:')
evaluate_model(dt_model, X_test, y_test)


In [None]:
# UMAP Visualization
print("\nUMAP Visualization:")
reducer = umap.UMAP()
embedding = reducer.fit_transform(data_cleaned)

plt.figure(figsize=(10, 7))
plt.scatter(embedding[:, 0], embedding[:, 1], c=y, cmap='Spectral', s=5)
plt.colorbar()
plt.title('UMAP projection of the dataset')
plt.show()