## Code for importing an Excel spreadsheet into Python

In [None]:
!pip install pandas openpyxl

In [None]:
import pandas as pd

# Read the workbook "adult.xlsx" (path is relative to the working directory) into a DataFrame.
df = pd.read_excel("adult.xlsx")
# A bare expression on the last line of a notebook cell is rendered as the cell output.
df


## look at basic info

In [None]:
# Column names of the DataFrame
df.columns

In [None]:
# Data shape as a (rows, columns) tuple
df.shape

In [None]:
# Data type of each column
df.dtypes

## look at missing values

In [None]:
# Count the missing entries in each column
df.isna().sum()

In [None]:
(df == '?').sum() # per-column count of '?' cells; in this particular data set some missing values are coded as '?'


In [None]:
df = df.replace('?', pd.NA) # recode every '?' cell as a proper missing value (pd.NA)


## explore numerical features

In [None]:
df.describe() # summary statistics for the numeric columns


## histogram of age

In [None]:
import matplotlib.pyplot as plt

# Histogram of the age column with labelled axes
age_axes = df['age'].hist()
age_axes.set_xlabel("Age")
age_axes.set_ylabel("Count")
plt.show()


In [None]:
# Scatter plot: age on the x-axis, weekly working hours on the y-axis
df.plot.scatter(x='age', y='hours.per.week')


## explore categoricals

In [None]:
# Number of rows in each income category
df['income'].value_counts()

In [None]:
# normalize=True returns each category's share as a proportion between 0 and 1
# (multiply by 100 for a percentage); this is not normalizing in the statistical sense
df['income'].value_counts(normalize=True)


In [None]:
# Bar chart of the number of rows in each workclass category
df['workclass'].value_counts().plot.bar()


## grouping / summary tables

In [None]:
# Mean weekly hours for each marital status
df.groupby('marital.status')['hours.per.week'].mean()

In [None]:
# Median weekly hours for each marital status
df.groupby('marital.status')['hours.per.week'].median()

In [None]:
# Full set of summary statistics of weekly hours for each marital status
df.groupby('marital.status')['hours.per.week'].describe()

In [None]:
# Mean weekly hours for each income category
df.groupby('income')['hours.per.week'].mean()

In [None]:
# Mean capital gain for each education level, largest first
df.groupby('education')['capital.gain'].mean().sort_values(ascending=False)

In [None]:
# New 0/1 indicator: 1 when the person is older than 50 AND works more than
# 50 hours per week, otherwise 0
is_over_50 = df['age'].gt(50)
works_over_50_hours = df['hours.per.week'].gt(50)
df['over50_and_workhard'] = (is_over_50 & works_over_50_hours).astype(int)

# Show the new column next to the two columns it is derived from
df[['age', 'hours.per.week', 'over50_and_workhard']].head()



## correlation matrix

In [None]:
# Pairwise correlations, restricted to the numeric columns
df.corr(numeric_only=True)

In [None]:
!pip install pandas seaborn

In [None]:
# Heat map of the numeric-column correlation matrix
import seaborn as sns
import matplotlib.pyplot as plt

corr_matrix = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=False, cmap="viridis", ax=ax)
plt.show()


## one step final summary

In [None]:
# skimpy is a third-party package and must be installed before this import works
from skimpy import skim

# One-step summary of the whole DataFrame
skim(df)



In [None]:
# NOTE(review): this cell repeats the skim(df) summary from the cell directly above
from skimpy import skim

skim(df)


## simple machine learning 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [None]:
# Simple cleanup: recode '?' as missing, then drop every row that has a missing value
df = df.replace('?', pd.NA).dropna()

In [None]:
# Features (X): every column except the target
X = df.drop(columns='income')
# Target (y): 1 when income is '>50K', otherwise 0
y = df['income'].eq('>50K').astype(int)

In [None]:
# Split the feature columns by dtype family instead of by exact dtype name:
# 'number' matches every integer/float width, and everything that is not
# numeric is treated as categorical. A column matched by neither list is not
# passed to the model by the ColumnTransformer, so together the two lists
# must cover every column of X.
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(exclude='number').columns

In [None]:
# model preprocessing
from sklearn.preprocessing import StandardScaler

# One-hot encode the categorical columns and standardize the numeric ones.
# Standardizing puts columns such as age and capital.gain on a common scale,
# which helps the solver converge and makes the fitted coefficients comparable
# in magnitude when they are ranked later on.
preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('num', StandardScaler(), num_cols)
    ]
)

# Chain preprocessing and classifier so both are fitted together by model.fit
model = Pipeline(steps=[
    ('preprocess', preprocess),
    ('logreg', LogisticRegression(max_iter=5000))
])


In [None]:
# train/test split (80% train, 20% test); stratify on y so both parts keep the
# same proportion of the two income classes
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Fit the preprocessing + logistic regression pipeline on the training rows
model.fit(X_train, y_train)

In [None]:
# Score the fitted logistic regression pipeline on the held-out test rows
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print(test_report)


In [None]:
# Predicted class for the first row. Pass X (features only) rather than df so
# the input has exactly the columns the pipeline was fitted on, without the
# income target column.
model.predict(X.iloc[[0]])

In [None]:
# Predicted class probabilities for the first row. Pass X (features only)
# rather than df so the input has exactly the columns the pipeline was fitted
# on, without the income target column.
model.predict_proba(X.iloc[[0]])

In [None]:
# Pull the fitted one-hot encoder back out of the pipeline's preprocessing step
fitted_preprocess = model.named_steps['preprocess']
ohe = fitted_preprocess.named_transformers_['cat']

# Expanded (one-hot) names for the categorical features
ohe_feature_names = ohe.get_feature_names_out(cat_cols)

# Full feature list in transformer order: encoded categoricals first, then the
# numeric columns under their original names
all_feature_names = [*ohe_feature_names, *num_cols]


In [None]:
# Fitted classifier step of the pipeline, looked up by its step name
log_reg = model['logreg']
# coef_ has one row per fitted decision function; take the first row as a 1D array
coefficients = log_reg.coef_[0]


In [None]:
import pandas as pd

# One row per model input feature, paired with its fitted coefficient
coef_columns = {'feature': all_feature_names, 'coefficient': coefficients}
coef_df = pd.DataFrame(coef_columns)


In [None]:
# Rank the features by coefficient magnitude (sign ignored) and show the 20 largest
coef_df = coef_df.assign(abs_coef=coef_df['coefficient'].abs())
coef_df_sorted = coef_df.sort_values(by='abs_coef', ascending=False)
coef_df_sorted.head(20)


In [None]:
import matplotlib.pyplot as plt

# Horizontal bars for the 20 largest-magnitude coefficients, largest at the top
top = coef_df_sorted.head(20)
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top['feature'], top['coefficient'])
ax.invert_yaxis()
ax.set_title("Top Logistic Regression Coefficients")
plt.show()


## random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Preprocessing for the random forest: one-hot encode the categorical columns
# and pass the numeric columns through unchanged
rf_transformers = [
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num', 'passthrough', num_cols),
]
preprocess = ColumnTransformer(transformers=rf_transformers)

In [None]:
# Random forest classifier, chained after the preprocessing step
forest = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf_model = Pipeline(steps=[('preprocess', preprocess), ('rf', forest)])

n_estimators=200 → number of trees in the forest (scikit-learn's default is 100; 200 is a reasonable choice here)

random_state=42 → reproducibility

n_jobs=-1 → use all CPU cores to speed up training

In [None]:
# Fit the preprocessing + random forest pipeline on the training rows
rf_model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Score the fitted random forest pipeline on the held-out test rows
y_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
rf_report = classification_report(y_test, y_pred)

print("Accuracy:", rf_accuracy)
print(rf_report)


In [None]:
# Fitted forest step of the pipeline, looked up by its step name
rf = rf_model['rf']
# One importance score per model input feature
importances = rf.feature_importances_

In [None]:
import pandas as pd

# Rebuild the feature names in transformer order: encoded categoricals first,
# then the numeric columns
rf_preprocess = rf_model.named_steps['preprocess']
ohe = rf_preprocess.named_transformers_['cat']
ohe_feature_names = ohe.get_feature_names_out(cat_cols)
all_feature_names = [*ohe_feature_names, *num_cols]

# Importance table, most important feature first
fi = pd.DataFrame({'feature': all_feature_names, 'importance': importances})
fi = fi.sort_values(by='importance', ascending=False)

fi.head(20)


In [None]:
from sklearn.tree import export_graphviz
import graphviz

# Use the first tree of the fitted forest as the example to draw
estimator = rf_model.named_steps['rf'].estimators_[0]

# Build the DOT description as a string (out_file=None), limited to depth 3
tree_style = dict(filled=True, rounded=True, max_depth=3)
dot_data = export_graphviz(
    estimator,
    out_file=None,
    feature_names=all_feature_names,
    class_names=['<=50K', '>50K'],
    **tree_style,
)

graph = graphviz.Source(dot_data)

# Export as PNG
graph.render("decision_tree", format="png", cleanup=True)

graph  # still displays inline if supported
