In [3]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import os

In [3]:
# Define dataset file paths.
# The directory holding the data can be overridden through the ADULT_DATA_DIR
# environment variable so the notebook is not tied to one machine; when the
# variable is unset, the original location is used.
data_dir = os.environ.get(
    "ADULT_DATA_DIR", r"C:\Users\emads\OneDrive\Desktop\ADULT_PJ\Adult"
)
train_file = os.path.join(data_dir, "adult.data")
test_file = os.path.join(data_dir, "adult.test")

In [14]:
# Verify files exist before attempting to load them.
# Collect every path that is absent so the error names the exact file(s)
# instead of leaving the reader to guess which one is wrong.
missing_files = [
    path for path in (train_file, test_file) if not os.path.exists(path)
]
if missing_files:
    raise FileNotFoundError(
        "Dataset files not found. Check your file paths. Missing: "
        + ", ".join(missing_files)
    )


In [None]:
# Column labels, in order; passed as `names=` when the CSV files are read.
columns = (
    "age workclass fnlwgt education education_num "
    "marital_status occupation relationship race sex "
    "capital_gain capital_loss hours_per_week native_country income"
).split()

In [2]:
# Load datasets.
# With skipinitialspace=True the blank after each comma is removed before the
# NA comparison, so the missing-value marker arrives as "?" (not " ?").
# Matching on "?" is what actually turns those cells into NaN.
df_train = pd.read_csv(train_file, names=columns, na_values="?", skipinitialspace=True)
# skiprows=1 drops the first line of the test file.
df_test = pd.read_csv(test_file, names=columns, na_values="?", skipinitialspace=True, skiprows=1)


NameError: name 'pd' is not defined

In [None]:
# Load the testing dataset (skip the first row as it contains headers in text).
# NOTE: this re-reads df_test and overwrites any earlier value of that name.
# skipinitialspace=True strips the blank after each comma before NA matching,
# so the missing-value marker must be given as "?" rather than " ?".
df_test = pd.read_csv(test_file, names=columns, na_values="?", skipinitialspace=True, skiprows=1)


In [None]:
# Stack the training and test frames into one frame with a fresh 0..n-1 index.
df = pd.concat(
    objs=[df_train, df_test],
    ignore_index=True,
)

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis="index", how="any")

In [None]:
# Remove literal "." characters from the income labels so that variants with
# a trailing period collapse onto the same label.
df["income"] = df["income"].str.replace(
    pat=".",
    repl="",
    regex=False,
)

In [None]:
# Output directory for the saved figures; created on demand, reused if present.
viz_folder = "visualizations"
os.makedirs(name=viz_folder, exist_ok=True)

In [1]:
# 1. Income Distribution by Age
# Histogram of age split by income class, with a KDE curve overlaid.
plt.figure(figsize=(8, 5))
sns.histplot(data=df, x="age", hue="income", kde=True, bins=30)
plt.title("Income Distribution by Age")
plt.savefig(f"{viz_folder}/income_distribution_by_age.png")  # Save figure
plt.show()
# Release the figure, as the other matplotlib cells in this notebook do.
plt.close()

NameError: name 'plt' is not defined

In [None]:
# 2. Count plot of income class per workclass, bars ordered by workclass frequency.
workclass_order = df["workclass"].value_counts().index
fig_workclass = plt.figure(figsize=(10, 5))
ax_workclass = sns.countplot(
    x="workclass",
    hue="income",
    order=workclass_order,
    data=df,
)
ax_workclass.set_title("Income by Workclass")
plt.xticks(rotation=90)  # category names are long; turn them vertical
fig_workclass.savefig(f"{viz_folder}/income_by_workclass.png")
plt.show()
plt.close()

In [None]:
# 3. Box plot of weekly working hours for each income class.
fig_hours = plt.figure(figsize=(8, 5))
ax_hours = sns.boxplot(x="income", y="hours_per_week", data=df)
ax_hours.set_title("Income vs. Hours Worked Per Week")
fig_hours.savefig(f"{viz_folder}/income_vs_hours_worked.png")
plt.show()
plt.close()

In [None]:
# 4. Interactive scatter of age against weekly hours, coloured by income class.
scatter_labels = {"age": "Age", "hours_per_week": "Hours Worked Per Week"}
fig = px.scatter(
    df,
    x="age",
    y="hours_per_week",
    color="income",
    title="Age vs. Hours Worked Per Week (Interactive)",
    labels=scatter_labels,
)
# Persist the interactive figure as a standalone HTML file, then render it.
fig.write_html(f"{viz_folder}/age_vs_hours_worked.html")
fig.show()


In [None]:
# 5. Interactive box plot of weekly hours per education level, split by income.
box_labels = {
    "education": "Education Level",
    "hours_per_week": "Hours Worked Per Week",
}
fig = px.box(
    df,
    x="education",
    y="hours_per_week",
    color="income",
    title="Hours Worked Per Week by Education Level",
    labels=box_labels,
    height=700,
    width=1200,
)
# Persist the interactive figure as a standalone HTML file, then render it.
fig.write_html(f"{viz_folder}/hours_by_education.html")
fig.show()

In [None]:
# 6. Heatmap: Correlation Matrix of Numeric Features
# The frame also contains string columns (e.g. workclass, education), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so restrict
# the input to numeric columns before computing correlations.
numeric_df = df.select_dtypes(include="number")
plt.figure(figsize=(10, 6))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Matrix of Numeric Features")
plt.savefig(f"{viz_folder}/correlation_matrix.png")
plt.show()
plt.close()

In [None]:
# 7. Interactive pie chart showing the share of each income class.
fig = px.pie(
    df,
    names="income",
    title="Income Level Distribution",
)
# Persist the interactive figure as a standalone HTML file, then render it.
fig.write_html(f"{viz_folder}/income_pie_chart.html")
fig.show()


In [None]:
# Final status line emitted once every plotting cell above has run.
completion_message = " All visualizations have been saved and displayed successfully."
print(completion_message)