# Data Description
The data contains 80 variables that characterize the demographic and socio-economic situation of 181
galaxies over a period of at most 26 years. A composite index is given that measures their well-being.
However, the demographic and socio-economic variables that influence this index are not known. We
seek to determine: what makes the galaxies better off?
#### We would like you to use the data and:
1. Tell us which variables best explain the variance of the well-being index
2. Determine the future well-being values of the galaxies
## Submission Instructions and Format
We have provided you data with observed values of the well-being index of each galaxy and a
validation dataset that requires the prediction of the future well-being index.
Kindly submit:
1. A report that discusses the demographic and socio-economic determinants of the galaxies'
wellbeing.
Submission Format: The report should be a pdf of a slide presentation of not more than 5 slides
2. The predicted future well-being index values with the highest possible level of certainty using
data in the validation dataset.
Submission Format: A csv file Saved as "firstname_lastname_DSA.csv" containing:
Variable Description
ID Unique identifier of the observations in the validation dataset
Predicted Well-Being Index Prediction for the Well-Being Index
3. Analysis file
Submission Format: python/R notebook with detailed comments and organized analysis of EDA with
visualizations, a well-documented analytical process and test results. Kindly follow reproducibility best practices.

## Submission Evaluation Criteria
The submission will be evaluated against the following criteria:
1. The reports will be assessed on the quality of the presentation of the findings: narrative and
visualizations
2. The future predictions of the well-being index will be evaluated using RMSE metric
3. The reproducibility of your codebase


import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
#Package for graphical analysis of missingvalues
import missingno as msno
# Libraries for data preparation and model building
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import Lasso
from sklearn.feature_selection import f_regression
from sklearn.ensemble import RandomForestRegressor 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import make_scorer, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.tree import export_text
import pickle




: 

In [None]:
def load_dataset(file_path):
    """
    Loads a dataset from a file path using pandas library

    This is a best-effort loader: a failure to read the file is reported on
    stdout and signalled by returning None, it is not raised.

    Parameters:
    -----------
    file_path : str
        The path to the dataset file

    Returns:
    --------
    pandas.DataFrame or None
        The loaded dataset as a pandas dataframe, or None when the file
        could not be read
    """
    # Show every column when a dataframe is displayed. This is a global
    # pandas display option unrelated to the file, so it stays outside the try.
    pd.set_option('display.max_columns', None)
    try:
        # Only the read itself is guarded
        dataset = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error loading dataset from {file_path}: {str(e)}")
        # Explicit so callers can see that None is the failure value
        return None
    print(f"Dataset loaded successfully from {file_path}")
    return dataset


: 

In [None]:
# Load the training and validation CSVs (paths are relative to the working directory).
# NOTE(review): load_dataset prints the error and yields None when a file cannot
# be read, in which case the .head() call below fails with an AttributeError.
train_data = load_dataset("datasets/Train_data.csv")
test_data = load_dataset("datasets/Validation.csv")
# Preview the first rows of the validation data
test_data.head()

: 

In [None]:
def explore_dataframe(df):
    """
    Print a structural overview of a pandas dataframe.

    The report is written to stdout in this order: df.info(), the transposed
    df.describe() table, the distinct values of every column that has fewer
    than 10 unique values, the duplicated rows, and the number of nulls per
    column.

    Args:
    df (pandas.DataFrame): The dataframe to be explored.

    Returns:
    None.
    """
    # info() writes its own output, so it is not wrapped in print()
    print("DataFrame Information:\n")
    df.info()

    print("\n\nDataFrame Description:\n")
    summary = df.describe().T
    print(summary)

    # Only low-cardinality columns are listed
    print("\n\nUnique Values:\n")
    for name in df.columns:
        column = df[name]
        if column.nunique() >= 10:
            continue
        print(name, ":", column.unique())

    print("\n\nDuplicated Rows:\n")
    duplicate_mask = df.duplicated()
    print(df[duplicate_mask])

    print("\n\nMissing Values:\n")
    null_counts = df.isnull().sum()
    print(null_counts)

# Print the exploration report for the training data.
# explore_dataframe only prints (its docstring states it returns None), so
# train_data_structure ends up holding None.
train_data_structure = explore_dataframe(train_data)

: 

After briefly looking through the data, we notice that some entries are missing.

We will determine the number of missing entries for a specified column in the dataset. We will also plot a bar graph and a matrix plot to visualize the missing data.

In [None]:
def total_missing(df,column_name):
    """
    Return a sentence stating how many null entries a column holds.

    Arguments:
    df -- pandas DataFrame
    column_name -- name (str) of the column to inspect

    Returns:
    str of the form "<column_name> has <count> missing values"
    """
    n_missing = df[column_name].isnull().sum()
    pieces = (column_name, " has ", str(n_missing), " missing values")
    return "".join(pieces)

# How many missing values does the column 'Population, urban (%)' have?
total_missing(train_data,'Population, urban (%)')

: 

The above column tested has more than half the data missing

In [None]:
def plot_missing_data(df):
    """
    Visualise the missing values of a DataFrame with missingno.

    Draws the matrix plot and then the bar plot, calling plt.show() after
    each one.

    Arguments:
    df -- pandas DataFrame

    Returns:
    None
    """
    # Matrix plot first, bar plot second
    for draw in (msno.matrix, msno.bar):
        draw(df)
        plt.show()

: 

It would be a good idea to replace some of the missing data. Missing values can be replaced with either the mean, the median or the mode (in the case of categorical columns). However, some columns have an extremely high percentage of missing values (more than half of the records). I might drop some of these columns to avoid bias when training our model for predictions.

In [None]:
def drop_missing_columns(df, n):
    """
    Drops, in place, the columns of a DataFrame that have more than ``n``
    missing values and prints the resulting information about the DataFrame.

    Arguments:
    df -- pandas DataFrame; it is modified in place
    n -- largest number of missing values a column may have and still be kept

    Returns:
    None
    """
    # dropna(thresh=...) counts NON-missing values: a column with at most n
    # missing entries has at least (rows - n) present ones. When n exceeds the
    # number of rows nothing can be dropped, so the threshold is floored at 0.
    threshold = max(df.shape[0] - n, 0)

    # Drop columns with too many missing values
    df.dropna(axis=1, thresh=threshold, inplace=True)

    # info() prints its report itself and returns None, so wrapping it in
    # print() would add a stray "None" line to the output.
    df.info()

# Work on a copy so train_data itself keeps all of its columns, then drop
# every column that has more than 1000 missing values.
clean_train_df = train_data.copy() 
drop_missing_columns(clean_train_df,1000)    


: 

In [None]:
# Re-plot the missing-value overview for the columns that survived the drop
plot_missing_data(clean_train_df)

: 

In [None]:
def impute_missing_values(df, impute_method):
    '''
    Imputes missing values in a DataFrame using mean, mode, or median for each column.

    Only columns that contain at least one missing value are touched. With
    'mode', the first of the most frequent values is used; a column whose
    values are all missing has no mode and is left unchanged.

    Parameters:
        df (pandas.DataFrame): DataFrame containing missing values
        impute_method (str): 'mean', 'mode', or 'median'

    Returns:
        pandas.DataFrame: DataFrame with imputed values (the input is not modified)

    Raises:
        ValueError: if impute_method is not one of the three supported names
    '''
    if impute_method not in ['mean', 'mode', 'median']:
        raise ValueError("impute_method must be 'mean', 'mode', or 'median'")

    # Make a copy of the DataFrame to avoid modifying the original
    df_imputed = df.copy()

    # Loop over columns with missing values and impute based on the specified method
    for col in df_imputed.columns[df_imputed.isnull().any()]:
        series = df_imputed[col]
        if impute_method == 'mean':
            fill_value = series.mean()
        elif impute_method == 'median':
            fill_value = series.median()
        else:
            modes = series.mode()
            if modes.empty:
                # Every value is missing, so there is nothing to impute with
                continue
            fill_value = modes.iloc[0]
        # Assign the filled column back instead of calling
        # df_imputed[col].fillna(..., inplace=True): that form operates on a
        # temporary object and does not reliably update the frame (it is a
        # no-op under pandas Copy-on-Write).
        df_imputed[col] = series.fillna(fill_value)

    return df_imputed

# Fill the remaining gaps with each column's median, then print the
# dataframe summary for the imputed frame.
median_imputed_df = impute_missing_values(clean_train_df, 'median')
median_imputed_df.info()


: 

In [None]:
def plot_numeric_histograms(df):
    """
    Plot a density curve and a histogram for every float64/int64 column.

    The density plots are laid out three per row. The grid has at least four
    rows and grows when there are more than 12 numeric columns.

    Arguments:
    df -- pandas DataFrame

    Returns:
    None
    """
    num_cols = list(df.select_dtypes(include=['float64', 'int64']).columns)
    if not num_cols:
        # Nothing numeric to draw
        return

    # A fixed 4x3 layout only has room for 12 subplots; pandas raises a
    # ValueError when the layout is smaller than the number of columns.
    # Ceiling division sizes the grid to the data while keeping 4 rows as the
    # minimum.
    n_rows = max(4, -(-len(num_cols) // 3))
    df[num_cols].plot(kind='density', subplots=True, layout=(n_rows, 3), sharex=False, figsize=(20, 15))
    df[num_cols].hist(figsize=(20,15))

# Distribution plots for the numeric columns of the imputed training data
plot_numeric_histograms(median_imputed_df)        

: 

In [None]:
def plot_boxplot(df):
    """
    Draw one box plot per float64/int64 column on a 4x3 grid of axes.

    Only the first 12 numeric columns are plotted; axes left without a
    column are removed from the figure.

    Arguments:
    df -- pandas DataFrame

    Returns:
    None
    """
    numeric_names = list(df.select_dtypes(include=['float64', 'int64']).columns)
    fig, grid = plt.subplots(nrows=4, ncols=3, figsize=(12, 16))
    panels = grid.flatten()

    # zip stops once the 12 panels are used up, so extra columns are ignored
    for panel, name in zip(panels, numeric_names):
        sns.boxplot(data=df[name], ax=panel)
        panel.set_title(name)

    # Remove any unused subplots
    for spare in panels[len(numeric_names):]:
        fig.delaxes(spare)
# Box plots for the imputed training data
plot_boxplot(median_imputed_df)        

: 

In [None]:
def relationship(df):
    """
    Draw an annotated heatmap of the correlations between the float64/int64
    columns of a DataFrame, with the colour scale fixed to [-1, 1].

    Arguments:
    df -- pandas DataFrame

    Returns:
    None
    """
    sns.set_style()
    numeric_names = list(df.select_dtypes(include=['float64', 'int64']).columns)
    corr_matrix = df[numeric_names].corr()
    axes = sns.heatmap(corr_matrix, vmin=-1, vmax=1, annot=True)
    axes.set_title('Correlation Heatmap', fontdict={'fontsize':10}, pad=12)

# Correlation heatmap for the imputed training data
relationship(median_imputed_df)    

: 

In [None]:
def plot_pairplot(df):
    """
    Plots a pairplot for a pandas DataFrame using the Seaborn library.

    Only the lower triangle of the grid is drawn (corner=True).

    Arguments:
    df -- pandas DataFrame

    Returns:
    None
    """
    # Create a pairplot using the Seaborn library.
    # `height` is the current name of the per-facet size argument; the old
    # `size` keyword is deprecated.
    sns.pairplot(df, height=2, corner=True)

    # Display the plot
    plt.show()

# Pairwise scatter plots for the imputed training data
plot_pairplot(median_imputed_df)

: 

In [None]:
def scale_dataframe_robust(df):
    """
    Scales the columns of a pandas DataFrame using the RobustScaler from the Scikit-learn library.

    Every column of ``df`` is scaled, so all columns must be numeric.

    Arguments:
    df -- pandas DataFrame

    Returns:
    scaled_data -- pandas DataFrame with the same column labels and the same
                   index as ``df``
    """
    # Create a RobustScaler object
    scaler = RobustScaler()

    # Fit and transform the data (the result is a plain array without labels)
    scaled_values = scaler.fit_transform(df)

    # Convert the scaled data to a DataFrame. The index is carried over as
    # well as the columns, so that rows still line up with `df` when the input
    # does not use the default 0..n-1 index (e.g. after filtering rows).
    scaled_data = pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

    # Return the scaled data
    return scaled_data

# Drop the 'galaxy' column, then scale every remaining column.
df  = median_imputed_df.drop('galaxy', axis=1)
scaled_df = scale_dataframe_robust(df)
# Display the scaled frame
scaled_df


: 

In [None]:
# Split the scaled frame into the feature matrix X and the target vector y.
# NOTE(review): scaled_df was produced by scaling all columns, so y is the
# scaled "Well-Being Index" and any error computed from it is in scaled units.
X = scaled_df.drop("Well-Being Index", axis=1).values
y = scaled_df["Well-Being Index"].values
print(type(X), type(y))

: 

In [None]:
def plot_actual_vs_predicted(model, X, y):
    """
    Fit ``model`` on an 80/20 train/test split (random_state=42), print its
    test-set error metrics and plot predicted against actual test values.

    Printed, in order: the mean squared error (2 decimals), its square root
    (RMSE), and the R2 score.

    Arguments:
    model -- unfitted estimator exposing fit() and predict()
    X -- feature matrix
    y -- target values

    Returns:
    model -- the same estimator object, fitted on the training portion
    """
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the specified model on the training data
    model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test)

    mse_score = mse(y_test, y_pred)

    # Print the error metrics: MSE, then RMSE, then R2
    print(f"Mean squared error: {mse_score:.2f}")
    print(np.sqrt(mse_score))
    r2 = r2_score(y_test, y_pred)
    print("R2:", r2)

    # Plot the actual vs predicted values
    plt.scatter(y_test, y_pred)
    plt.xlabel('Actual Labels')
    plt.ylabel('Predicted Labels')
    plt.title('Actual vs Predicted Values')

    # Overlay the regression line (degree-1 least-squares fit of predicted on actual)
    z = np.polyfit(y_test, y_pred, 1)
    p = np.poly1d(z)
    plt.plot(y_test, p(y_test), color='magenta')
    plt.show()

    # Hand the fitted estimator back so it can be reused without refitting
    return model


: 

In [None]:
# Baseline: ordinary linear regression
plot_actual_vs_predicted(LinearRegression(),X,y)

: 

In [None]:
# Decision tree with default settings (no random_state is set here)
plot_actual_vs_predicted(DecisionTreeRegressor(), X,y)

: 

In [None]:
# Repeats the decision-tree evaluation from the previous cell;
# DecisionTreeRegressor is already imported at the top of the notebook.
from sklearn.tree import DecisionTreeRegressor
plot_actual_vs_predicted(DecisionTreeRegressor(),X,y)

: 

In [None]:
# Gradient boosting with default settings (no random_state is set here)
plot_actual_vs_predicted(GradientBoostingRegressor(), X,y)

: 

In [None]:
# Repeats the gradient-boosting evaluation from the previous cell;
# GradientBoostingRegressor is already imported at the top of the notebook.
from sklearn.ensemble import GradientBoostingRegressor
plot_actual_vs_predicted(GradientBoostingRegressor(),X,y)

: 

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the size of each part
print ('Training Set: %d rows\nTest Set: %d rows' % (X_train.shape[0], X_test.shape[0]))

: 

In [None]:
# Score every feature against the target with f_regression. Because k='all'
# no feature is actually removed; the fitted scores are what is of interest.
# (Both names are already imported at the top of the notebook.)
from sklearn.feature_selection import SelectKBest, f_regression
# configure to select all features
best_features = SelectKBest(score_func=f_regression, k='all')
# learn relationship from training data
best_features.fit(X_train, y_train)
# transform train input data
X_train_best_features = best_features.transform(X_train)
# transform test input data
X_test_best_features = best_features.transform(X_test)

: 

In [None]:
# Print the score of each feature. Features are identified by their column
# position in X, because X is a plain array without column names.
for i in range(len(best_features.scores_)):
    print('Feature %d: %f' % (i, best_features.scores_[i]))
# plot the scores
plt.bar([i for i in range(len(best_features.scores_))], best_features.scores_)
plt.show()

: 

In [None]:
def cross_validate_models(model, X, y):
    """
    Cross-validate a model on 6 shuffled folds (random_state=42) and print
    the mean and standard deviation of the fold scores.

    Arguments:
    model -- estimator accepted by cross_val_score
    X -- feature matrix
    y -- target values

    Returns:
    None
    """
    folds = KFold(shuffle=True, random_state=42, n_splits=6)
    cv_results = cross_val_score(model, X, y, cv=folds)
    print(f"CV results: {cv_results.mean():.3f} (+/- {cv_results.std():.3f})")

: 

In [None]:
# Cross-validate a seeded decision tree on the full X, y
cross_validate_models(DecisionTreeRegressor(random_state=42), X, y)

: 

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
# 5-fold shuffled cross-validation with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Search 10 evenly spaced alpha values between 0.0001 and 1.
# np.arange(0.0001, 1, 10) uses 10 as the STEP, so it produced the single
# value 0.0001 and no regularisation strength was actually compared.
param_grid = {"alpha": np.linspace(0.0001, 1, 10),"solver": ["sag", "lsqr"]}
ridge = Ridge()
ridge_cv = GridSearchCV(ridge, param_grid, cv=kf)
ridge_cv.fit(X_train, y_train)
# Best parameter combination and its mean cross-validated score
print(ridge_cv.best_params_, ridge_cv.best_score_)



: 

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# 5-fold shuffled cross-validation with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# The previous grid, {'alpha': np.arange(0.0001, 1, 10)}, had two problems:
#   * arange used 10 as the step and so yielded the single value 0.0001,
#     fewer candidates than n_iter=2 asks for;
#   * `alpha` only affects GradientBoostingRegressor with the huber/quantile
#     losses, so with the default loss the search changed nothing.
# The learning rate is searched instead, over 10 evenly spaced values.
param_grid = {'learning_rate': np.linspace(0.01, 1, 10)}
# Fixed seeds keep the sampled candidates and the fitted trees repeatable
grad = GradientBoostingRegressor(random_state=42)
grad_cv = RandomizedSearchCV(grad, param_grid, cv=kf, n_iter=2, random_state=42)
grad_cv.fit(X_train, y_train)
# Best sampled parameters and their mean cross-validated score
print(grad_cv.best_params_, grad_cv.best_score_)


: 

In [None]:
#evaluating on the test set
# Scores the fitted search object on the held-out split
test_score = grad_cv.score(X_test, y_test)
print(test_score)


: 

In [None]:
# Rebuild the feature matrix and target from the scaled frame
X = scaled_df.drop("Well-Being Index", axis=1).values
y = scaled_df['Well-Being Index'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
# evaluating regression models
# Each key is the label used on the box plot and in the printed scores, so it
# has to name the estimator it maps to: the random forest was previously
# labelled "Linear Regression".
models = {"Random Forest": RandomForestRegressor(), "Gradient Boost": GradientBoostingRegressor(), "Decision Tree": DecisionTreeRegressor()}
# One seeded splitter, so every model is scored on the same 6 folds
kf = KFold(n_splits=6, random_state=42, shuffle=True)
results = []
for model in models.values():
    cv_results = cross_val_score(model, X_train, y_train, cv=kf)
    results.append(cv_results)
# One box per model showing the spread of its fold scores
plt.boxplot(results, labels=models.keys())
plt.show()


: 

In [None]:
# Fit each model on the training split and score it on the held-out split.
# NOTE(review): these are regressors, so score() is presumably R^2 rather than
# a classification accuracy; the word "Accuracy" in the message is a misnomer.
for name, model in models.items():  
    model.fit(X_train, y_train)  
    test_score = model.score(X_test, y_test)
    print("{} Test Set Accuracy: {}".format(name, test_score))

: 

: 