Class Project 1

In [91]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Data 
url = "https://raw.githubusercontent.com/drolsonmi/math3480/refs/heads/main/datasets/young-people-survey-responses.csv"
df = pd.read_csv(url)

# Drop variables that are not necessary
unnecessary_columns = ['Village - town']  
df.drop(columns=unnecessary_columns, inplace=True, errors='ignore')

#Missing Values
missing_threshold = 0.10 
for col in df.columns:
    missing_ratio = df[col].isnull().mean()
    if missing_ratio > missing_threshold:
        df.drop(columns=[col], inplace=True)
    elif missing_ratio > 0:
        if df[col].dtype in [np.float64, np.int64]:
            df[col].fillna(df[col].median(), inplace=True) 
        else:
            df[col].fillna(df[col].mode()[0], inplace=True) 

# Encode categorical variables
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
encoder = OneHotEncoder(drop='first', sparse_output=False)
categorical_transformed = encoder.fit_transform(df[categorical_cols])
categorical_df = pd.DataFrame(categorical_transformed, columns=encoder.get_feature_names_out(categorical_cols))

df.drop(columns=categorical_cols, inplace=True)
df = pd.concat([df, categorical_df], axis=1)

#Cross Validation (Divide data into training and testing sets)
target = 'Loneliness'
X = df.drop(columns=[target])
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values