# Labour Earnings Prediction

**Problem Statement:**  
Predict **labour earnings for the year 1978** using **demographic and socio-economic features** from the years **1974 and 1975**.

---

**By:** *Mowlick Armstrong*

####  Step 1: Import Required Libraries

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning tools
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

#### Step 2: Load and Explore the Dataset.

In [None]:
# Load the raw dataset; the path is relative, so the CSV must sit in the
# working directory.
df = pd.read_csv("LabourTrainingEvaluationData.csv")

# Show the first 5 rows
print(df.head())

# Display column dtypes and non-null counts.
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called directly: wrapping it in print() would add a stray "None" line.
print("\nDataset Info:")
df.info()

# Count the missing values in each column
print("\nMissing Values:")
print(df.isnull().sum())

#### Step 3: Data Cleaning & Preprocessing.

In [None]:
# Drop rows with missing values. The explicit copy gives the column
# assignments below an independent frame to write into.
df = df.dropna().copy()

# Columns this notebook treats as binary indicators
# (race, hispanic, married).
binary_cols = ['Race', 'Hisp', 'MaritalStatus']

for col in binary_cols:
    values = df[col]
    if values.isin([0, 1]).all():
        # Already coded as 0/1: keep the values and just normalise the dtype.
        df[col] = values.astype('int64')
    else:
        # Series.map with a {0: 0, 1: 1} dict converts every value that is
        # not a dict key into NaN, which would blank out a text-coded column.
        # Encode the labels as integer category codes so no data is lost.
        as_category = values.astype('category')
        df[col] = as_category.cat.codes
        # Codes follow the sorted order of the labels, so print the mapping
        # to make the meaning of each code explicit for this column.
        print(f"{col} encoding:", dict(enumerate(as_category.cat.categories)))

# View dataset after cleaning
print(df.head())

#### Step 4: Define Features and Target Variable.

In [None]:
# Convert 'Eduacation' to a categorical and store its integer codes in place
education_as_category = df['Eduacation'].astype('category')
df['Eduacation'] = education_as_category.cat.codes

# Show the distinct integer codes now held in the column
encoded_levels = df['Eduacation'].unique()
print("Education Levels Encoded:")
print(encoded_levels)


In [None]:
# Name of the column to predict, and the columns used as predictors
target_col = 'Earnings_1978'
feature_cols = [
    'Age',
    'Eduacation',
    'Race',
    'Hisp',
    'MaritalStatus',
    'Earnings_1974',
    'Earnings_1975',
]

# Select the predictor frame (X) and the target series (y)
X = df.loc[:, feature_cols]
y = df.loc[:, target_col]

# Report the dimensions of both selections
for _label, _selection in (("Feature shape:", X), ("Target shape:", y)):
    print(_label, _selection.shape)


#### Step 5: Feature Scaling

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Step 1: Reload the original dataset so this cell starts from the raw CSV
df = pd.read_csv("LabourTrainingEvaluationData.csv")

# Step 2: Drop Race, Hisp and MaritalStatus; they are not part of the feature
# set defined below.
# NOTE(review): these columns were reported as entirely null - confirm that
# against the raw CSV before treating them as unusable.
df = df.drop(columns=['Race', 'Hisp', 'MaritalStatus'], errors='ignore')  # safely ignore if already dropped

# Step 3: Drop rows that have missing values in required columns.
# This must run BEFORE the encoding step: category codes represent a missing
# value as -1, so encoding first would hide missing 'Eduacation' entries from
# dropna and let -1 enter the model as if it were a real education level.
required_cols = ['Age', 'Eduacation', 'Earnings_1974', 'Earnings_1975', 'Earnings_1978']
df = df.dropna(subset=required_cols)

# Step 4: Encode 'Eduacation' as integer category codes
# (codes follow the sorted order of the labels present in the column)
df['Eduacation'] = df['Eduacation'].astype('category').cat.codes

# Step 5: Define features and target
feature_cols = ['Age', 'Eduacation', 'Earnings_1974', 'Earnings_1975']
target_col = 'Earnings_1978'

X = df[feature_cols]
y = df[target_col]

# Step 6: Scale features (the scaler is fit on every row of X)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Check
print(" Reload successful. Final shape:", X.shape)


#### Step 6: Train-Test Split 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the UNSCALED features first so the test rows stay unseen while the
# scaling parameters are estimated. random_state is fixed for a reproducible
# partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that same transform to
# the test rows. Fitting on the full data set would leak the test set's mean
# and variance into the training features and bias the evaluation.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Check shapes
print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)


#### Step 7: Train the Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression

# Build an ordinary least-squares regressor and fit it on the training split
# (fit() returns the estimator itself, so construction and fitting chain)
model = LinearRegression().fit(X_train, y_train)

# Report the fitted parameters: one coefficient per feature, plus the intercept
learned_coefficients = model.coef_
learned_intercept = model.intercept_
print("Model Coefficients:", learned_coefficients)
print("Model Intercept:", learned_intercept)

#### Step 8: Make Predictions and Evaluate the Model

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Generate predictions for the held-out rows
y_pred = model.predict(X_test)

# RMSE is the square root of the mean squared error
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)

# Coefficient of determination on the test rows
r2 = r2_score(y_test, y_pred)

# Print the evaluation summary
print("Model Evaluation:")
for _metric_name, _metric_value in (
    ("Root Mean Squared Error (RMSE):", rmse),
    ("R² Score:", r2),
):
    print(_metric_name, _metric_value)