In [None]:
import pandas as pd

# Remote location of the raw happiness-score CSV
url = "https://github.com/dsrscientist/DSData/raw/master/happiness_score_dataset.csv"

# Read the CSV over HTTP straight into a DataFrame
df = pd.read_csv(url)

# Sanity-check the load by printing the leading rows
preview = df.head()
print(preview)


In [None]:
# Impute missing values in the numeric columns with each column's mean.
#
# numeric_only=True is required here: on pandas >= 2.0, DataFrame.mean() no
# longer silently skips non-numeric columns and raises a TypeError when the
# frame contains strings (a later cell treats 'Country' as categorical).
# Non-numeric columns are therefore left untouched by this imputation; any
# missing values in them will still show up in the report printed below.
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)

# Report the remaining missing-value count per column
print("\nMissing values after handling:\n", df.isnull().sum())


In [None]:
# Correlate every numeric feature with the target variable.
#
# Only numeric/bool columns are passed to corr(): since pandas 2.0 the default
# is numeric_only=False, so calling df.corr() on a frame that still contains
# string columns raises a ValueError. Selecting the columns explicitly behaves
# the same on older and newer pandas releases.
numeric_df = df.select_dtypes(include=["number", "bool"])
correlation_matrix = numeric_df.corr()

# Keep just the target's column, strongest positive correlation first
correlation_with_target = correlation_matrix["Happiness Score"].sort_values(ascending=False)

# Display correlation coefficients
print("Correlation with Happiness Score:\n", correlation_with_target)


In [None]:
# Features whose absolute correlation with the target is below this value
# are treated as irrelevant (tune as needed)
correlation_threshold = 0.1  # Adjust this threshold as needed

# Boolean mask over the correlation series: True where the link is too weak
is_weak = correlation_with_target.abs() < correlation_threshold

# Names of the weakly correlated columns, then remove them from the frame
irrelevant_columns = correlation_with_target.index[is_weak.values]
df.drop(columns=irrelevant_columns, inplace=True)

# Show the frame that remains after the drop
print("\nDataFrame after dropping irrelevant columns:\n", df.head())


In [None]:
# Show the dtype of every column so categorical variables are easy to spot
print("Data types:\n", df.dtypes)

# One-hot encode every remaining non-numeric column instead of hard-coding
# 'Country'. Any other string column left unencoded would reach the model as
# text and make fitting fail. Detecting the columns also makes this cell safe
# to re-run: once everything is encoded the list is empty and nothing happens,
# whereas the hard-coded name raised a KeyError on the second run.
# NOTE(review): if 'Country' is unique per row, its dummies act as row
# identifiers rather than features - consider dropping it instead; confirm
# against the data.
categorical_columns = df.select_dtypes(exclude=["number", "bool"]).columns.tolist()
if categorical_columns:
    df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Display the DataFrame after encoding
print("\nDataFrame after encoding categorical variables:\n", df.head())


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the float-valued feature columns (zero mean, unit variance).
#
# The target 'Happiness Score' is deliberately excluded: it is a float column
# too, and scaling it together with the features would put the model's
# predictions and the reported MSE in standardised units instead of score
# units. errors="ignore" keeps the cell working if the target is absent from
# the float selection.
# NOTE(review): the scaler is fitted on the whole frame before the train/test
# split performed in a later cell, so test-set statistics influence the
# training features; fitting on the training split only would avoid that.
numerical_columns = (
    df.select_dtypes(include=['float64'])
    .columns
    .drop('Happiness Score', errors='ignore')
)

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit on, and replace, the selected feature columns
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Display the DataFrame after scaling
print("\nDataFrame after scaling numerical variables:\n", df.head())


In [None]:
from sklearn.model_selection import train_test_split

# Target vector is the 'Happiness Score' column; every other column is a feature
y = df['Happiness Score']
X = df.drop('Happiness Score', axis=1)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each of the four pieces
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {label}:", part.shape)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Build an ordinary least-squares regressor and fit it on the training split
# (fit() returns the estimator itself, so construction and fitting can be chained)
model = LinearRegression().fit(X_train, y_train)

# Predictions for the same rows the model was trained on
y_train_pred = model.predict(X_train)

# In-sample error and goodness of fit
mse_train = mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

# Report the training-set metrics
print("Training Set Performance:")
print("Mean Squared Error:", mse_train)
print("R-squared:", r2_train)


In [None]:
# Score the held-out rows with the fitted model
y_test_pred = model.predict(X_test)

# Out-of-sample error and goodness of fit
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

# Report the test-set metrics
print("Testing Set Performance:")
print("Mean Squared Error:", mse_test)
print("R-squared:", r2_test)


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for LinearRegression.
# The former 'normalize' entry is gone: that constructor argument was
# deprecated in scikit-learn 1.0 and removed in 1.2, so keeping it in the grid
# makes GridSearchCV fail with an "invalid parameter" error. Feature scaling is
# already handled by the StandardScaler cell earlier in the notebook.
param_grid = {
    'fit_intercept': [True, False],
}

# Create a GridSearchCV object: 5-fold cross-validation, scored by negative MSE.
# A fresh estimator is passed directly rather than rebinding `model`, so the
# fitted `model` from the training cell stays usable if earlier evaluation
# cells are re-run.
grid_search = GridSearchCV(
    LinearRegression(),
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)

# Perform grid search on the training set
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found by the grid search
print("Best Hyperparameters:", grid_search.best_params_)

# Get the best model from the grid search (refitted on the full training set)
best_model = grid_search.best_estimator_

# Make predictions on the testing set using the best model
y_test_pred_tuned = best_model.predict(X_test)

# Evaluate the tuned model on the testing set
mse_test_tuned = mean_squared_error(y_test, y_test_pred_tuned)
r2_test_tuned = r2_score(y_test, y_test_pred_tuned)

print("\nTuned Model Performance on Testing Set:")
print("Mean Squared Error:", mse_test_tuned)
print("R-squared:", r2_test_tuned)


In [None]:
# Score new observations with the tuned model.
#
# `new_data` must be a DataFrame with the same feature columns the model was
# trained on (the columns of X_train, i.e. without 'Happiness Score').
# No cell visible in this notebook creates `new_data`, so on a fresh kernel the
# predict call failed with a NameError. When no `new_data` has been supplied,
# fall back to the first rows of X_test purely as a stand-in so the notebook
# still runs top to bottom; a `new_data` defined beforehand is used unchanged.
if "new_data" not in globals():
    new_data = X_test.head()

# Use the best model from hyperparameter tuning to make predictions on new data
new_data_predictions = best_model.predict(new_data)

# Display the predictions
print("Predictions on new data:\n", new_data_predictions)


In [None]:
# Pair each fitted coefficient with the name of the feature it belongs to
feature_coefficients = pd.Series(data=best_model.coef_, index=X_train.columns)

# Rank features by absolute coefficient size, largest first
# (the sign is discarded, so this shows strength of effect, not direction)
coefficient_magnitudes = feature_coefficients.abs()
sorted_coefficients = coefficient_magnitudes.sort_values(ascending=False)

# Print the ranked magnitudes
print("Feature coefficients:\n", sorted_coefficients)


In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the ranked coefficient magnitudes,
# drawn on an explicitly created 12x8 inch figure/axes pair
fig, ax = plt.subplots(figsize=(12, 8))
sorted_coefficients.plot(kind='barh', ax=ax)
ax.set_title("Feature Importance in Predicting Happiness Score")
ax.set_xlabel("Coefficient Magnitude")
ax.set_ylabel("Feature")
plt.show()


In [None]:
# Recompute the correlation of every feature with the target on the processed frame.
#
# Only numeric/bool columns are passed to corr(): since pandas 2.0 the default
# is numeric_only=False, so df.corr() raises a ValueError if any string column
# is still present. Bool columns are kept so one-hot dummy columns stay in the
# result.
correlation_with_target = (
    df.select_dtypes(include=["number", "bool"])
    .corr()['Happiness Score']
    .sort_values(ascending=False)
)

# Display the correlation coefficients
print("Correlation with Happiness Score:\n", correlation_with_target)


In [None]:
import statsmodels.api as sm

# Add a constant term to the features matrix (for the intercept).
# The features are cast to float first: on recent pandas releases
# pd.get_dummies produces bool columns, and a frame mixing bool and float
# columns is converted to an object array, which statsmodels rejects
# ("Pandas data cast to numpy dtype of object"). The cast is a no-op for
# columns that are already float.
X_train_with_const = sm.add_constant(X_train.astype(float))

# Fit a linear regression model with statsmodels to obtain p-values
model_stats = sm.OLS(y_train, X_train_with_const).fit()

# Display summary statistics, including p-values
print(model_stats.summary())
