In [14]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

file_path = 'data/processed/cleaned_master.csv'
df = pd.read_csv(file_path)

# Define features and target variable
predictor = ['HDI for year']
response = 'suicides/100k pop'

# Initialize lists to store results
countries = []
slopes = []
r_squared_values = []
p_values = []

# Perform linear regression for each unique country
unique_countries = df['country'].unique()
for country in unique_countries:
    country_df = df[df['country'] == country]
    
    X = country_df[predictor]
    y = country_df[response]
    
    # Add constant to X for intercept
    X = sm.add_constant(X)
    
    # Fit linear regression model using statsmodels
    model = sm.OLS(y, X)
    results = model.fit()
    
    # Get slope, R-squared, and p-value
    slope = results.params['HDI for year']
    r_squared = results.rsquared
    p_value = results.pvalues['HDI for year']
    
    # Append results to lists
    countries.append(country)
    slopes.append(slope)
    r_squared_values.append(r_squared)
    p_values.append(p_value)

# Create DataFrame with results
results_df = pd.DataFrame({
    'country': countries,
    'slope': slopes,
    'r_squared': r_squared_values,
    'p_value': p_values
})

# Find the country with the highest slope based on HDI
country_with_max_slope = results_df.loc[results_df['slope'].idxmax()] 

# Print results
print(f"Highest slope value based on HDI: {country_with_max_slope['slope']}")
print(f"R-squared value for this country: {country_with_max_slope['r_squared']}")
print(f"P-value for HDI coefficient: {country_with_max_slope['p_value']}\n")


# Print significant results 
significant_results_df = results_df[results_df['p_value'] < 0.05]
if not significant_results_df.empty:
    MAX_slope_significant = significant_results_df.loc[significant_results_df['slope'].idxmax()]
    print(f"Country with highest significant slope: {MAX_slope_significant}/n")
else:
    print("No significant results found with p-value < 0.05")
print("A positive slope in the context of HDI (Human Development Index) vs suicide rate suggests that as the HDI increases, the suicide rate tends to increase as well. Countries with higher HDI generally have better socioeconomic indicators such as income, education, and life expectancy. These factors contribute to overall well-being but can also lead to higher expectations and pressures. Something we can target as a company")


  return 1 - self.ssr/self.centered_tss


Highest slope value based on HDI: 11853.419768318128
R-squared value for this country: 0.07333016905905998
P-value for HDI coefficient: 0.11016750827580955

Country with highest significant slope: country      Bosnia and Herzegovina
slope                   1001.759259
r_squared                   0.25146
p_value                    0.012545
Name: 13, dtype: object/n
A positive slope in the context of HDI (Human Development Index) vs suicide rate suggests that as the HDI increases, the suicide rate tends to increase as well. Countries with higher HDI generally have better socioeconomic indicators such as income, education, and life expectancy. These factors contribute to overall well-being but can also lead to higher expectations and pressures. Something we can target as a company


  return 1 - self.ssr/self.centered_tss
