# IMPORT USED LIBRARIES

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns


# TAKE THE FILE PATH

In [None]:
# Load the processed dataset into `df`.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# only runs where this exact location exists. Consider a path relative to the
# project root.
path_file = r"D:\Self Study\Project\engel_law\data\processed\engel_law.csv"
df = pd.read_csv(path_file, header = 0) # take the first row as the column names
# check
# print (df.head(5))
# print (df.describe(include ="all"))

     newid  ref_yr  quarter  food_share  quarterly_income
0  5688684    2025        1    0.289093          50000.00
1  5686523    2024        4    0.298011           7500.00
2  5827122    2024        4    0.036222          21250.00
3  5680923    2024        4    0.042971           2666.00
4  5666482    2024        3    0.060795         133591.75
              newid        ref_yr       quarter    food_share  \
count  1.416900e+04  14169.000000  14169.000000  14169.000000   
mean   5.662254e+06   2024.199450      2.194156      0.169022   
std    1.557455e+05      0.399601      1.162869      0.200135   
min    5.343734e+06   2024.000000      1.000000      0.000000   
25%    5.566283e+06   2024.000000      1.000000      0.030711   
50%    5.668152e+06   2024.000000      2.000000      0.055639   
75%    5.816351e+06   2024.000000      3.000000      0.306079   
max    5.981451e+06   2025.000000      4.000000      0.984453   

       quarterly_income  
count      14169.000000  
mean       248

# CLEAN DATA

In [None]:
# convert data types: cast both analysis columns to float in a single astype call
df = df.astype({"food_share": float, "quarterly_income": float})
# check
# print(df["quarterly_income"].dtypes)
# print(df["food_share"].dtypes)

float64
float64


In [None]:
# Keep only the rows usable for the regression, written as one chained
# expression instead of an inplace dropna followed by repeated rebinding:
#   1. drop rows where food_share or quarterly_income is missing
#   2. keep quarterly_income > 0 (the log transform needs a strictly positive income)
#   3. keep food_share within [0, 1] (between() is inclusive on both ends)
# The final .copy() makes `df` an independent frame, so columns added to it
# later are not written to a slice of the unfiltered data.
df = (
    df
    .dropna(subset=["food_share", "quarterly_income"])
    .loc[lambda d: d["quarterly_income"] > 0]
    .loc[lambda d: d["food_share"].between(0, 1)]
    .copy()
)

In [None]:
# create the log_quarterly_income column (natural log of quarterly_income);
# assign() returns a new frame with the column added, which is bound back to df
df = df.assign(log_quarterly_income=np.log(df["quarterly_income"]))
# check
# print(df.describe(include="all"))

              newid        ref_yr       quarter    food_share  \
count  1.416900e+04  14169.000000  14169.000000  14169.000000   
mean   5.662254e+06   2024.199450      2.194156      0.169022   
std    1.557455e+05      0.399601      1.162869      0.200135   
min    5.343734e+06   2024.000000      1.000000      0.000000   
25%    5.566283e+06   2024.000000      1.000000      0.030711   
50%    5.668152e+06   2024.000000      2.000000      0.055639   
75%    5.816351e+06   2024.000000      3.000000      0.306079   
max    5.981451e+06   2025.000000      4.000000      0.984453   

       quarterly_income  log_quarterly_income  
count      14169.000000          14169.000000  
mean       24856.217552              9.573370  
std        26818.192789              1.246612  
min            0.250000             -1.386294  
25%         8003.000000              8.987572  
50%        17000.000000              9.740969  
75%        32430.000000             10.386839  
max       255631.000000       

In [18]:
# check to make sure the data is clean: every count printed below must be 0
# income is tested with <= 0 (not < 0) so the check mirrors the "> 0" filter
# used during cleaning; a zero income would otherwise slip through and
# produce log(0) = -inf
print((df["quarterly_income"] <= 0).sum())
print((df["food_share"] < 0).sum())
print((df["food_share"] > 1).sum())

0
0
0


# DRAW THE DIAGRAM

In [22]:
# quarterly_income ~ food_share
# scatter plot drawn through the explicit figure/axes interface, saved to PNG
fig, ax = plt.subplots()
ax.scatter(df["quarterly_income"], df["food_share"])
ax.set_xlabel("Quarterly Income")
ax.set_ylabel("Food Share")
ax.set_title("Income and Food Share relationship")
fig.tight_layout()
fig.savefig("scatter_income_food_share.png", dpi=200)
plt.close(fig)  # the figure is only needed on disk

In [32]:
# regression line for quarterly_income and food share
# columns are referenced by name through data=df; the fitted line is drawn in red
fig, ax = plt.subplots()
sns.regplot(
    data=df,
    x="quarterly_income",
    y="food_share",
    line_kws={"color": "red"},
    ax=ax,
)
ax.set_ylim(bottom=0)  # pin the lower limit at 0, leave the upper limit automatic
ax.set_xlabel("Quarterly Income")
ax.set_ylabel("Food Share")
ax.set_title("Income and Food Share relationship")
fig.tight_layout()
fig.savefig("regline_income_food_share.png", dpi=200)
plt.close(fig)

In [26]:
# log_quarterly_income ~ food_share
# scatter plot drawn through the explicit figure/axes interface, saved to PNG
fig, ax = plt.subplots()
ax.scatter(df["log_quarterly_income"], df["food_share"])
ax.set_xlabel("Log of Quarterly Income")
ax.set_ylabel("Food Share")
ax.set_title("Log Income and Food Share relationship")
fig.tight_layout()
fig.savefig("scatter_log_income_food_share.png", dpi=200)
plt.close(fig)  # the figure is only needed on disk

In [31]:
# regression line for log_quarterly_income and food share
# columns are referenced by name through data=df; the fitted line is drawn in red
fig, ax = plt.subplots()
sns.regplot(
    data=df,
    x="log_quarterly_income",
    y="food_share",
    line_kws={"color": "red"},
    ax=ax,
)
ax.set_ylim(bottom=0)  # pin the lower limit at 0, leave the upper limit automatic
ax.set_xlabel("Log of Quarterly Income")
ax.set_ylabel("Food Share")
ax.set_title("Log Income and Food Share relationship")
fig.tight_layout()
fig.savefig("regline_log_income_food_share.png", dpi=200)
plt.close(fig)

# TAKE THE SUMMARY INFORMATION

In [40]:
def summary_infor(a,b):
    """Fit an OLS regression of ``b`` on ``a`` and return its summary.

    Parameters
    ----------
    a : explanatory data; a constant column is added to it with
        ``sm.add_constant`` before fitting.
    b : dependent variable passed to ``sm.OLS`` as the response.

    Returns
    -------
    The object returned by ``.summary()`` on the fitted model (callers in
    this notebook render it with ``.as_text()``).
    """
    design = sm.add_constant(a)
    fitted = sm.OLS(b, design).fit()
    return fitted.summary()
# check
# print(summary_infor(df["quarterly_income"],df["food_share"]))
# print(summary_infor(df["log_quarterly_income"],df["food_share"]))

# SAVE SUMMARY INFORMATION FOR ANALYZING

In [42]:
# Write one text report per model: (output file, heading line, regressor column).
# Order matches the original cell: log-income model first, then raw income.
for out_name, heading, regressor in (
    ("regression_food_share_log_income_result.txt", "Model : FOOD_SHARE ~ LOG_INCOME\n\n", "log_quarterly_income"),
    ("regression_food_share_income_result.txt", "Model : FOOD_SHARE ~ INCOME\n\n", "quarterly_income"),
):
    with open(out_name, "w", encoding="utf-8") as report_file:
        report_file.write(heading)
        # the model is fitted after the heading is written, as in the original cell
        report_file.write(summary_infor(df[regressor], df["food_share"]).as_text())