<h3>Analysing Food Security in Kenya</h3>

In [2]:
# import libraries
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyreadstat as pr
import seaborn as sns
import statsmodels.api as sm
from scipy import stats

In [3]:
# Ignore warnings
# NOTE(review): with no category or message given, this silences every
# warning for the rest of the session, deprecation notices included.
# Consider narrowing it to specific categories once the noisy ones are known.
warnings.filterwarnings("ignore")

In [6]:
# Check the data directory
def list_data_files(data_dir="data"):
    """Return the path of every file found under ``data_dir``, recursively.

    Sub-directories and file names are visited in sorted order so the listing
    is identical on every run. A missing ``data_dir`` yields an empty list,
    because ``os.walk`` ignores listing errors by default.
    """
    found = []
    for root, dirs, files in os.walk(data_dir):
        dirs.sort()  # sorting in place steers the order os.walk descends in
        found.extend(os.path.join(root, name) for name in sorted(files))
    return found


# List every file (recursively) in the data folder
data_files = list_data_files("data")
for file_path in data_files:
    print(f"File: {file_path}")
if not data_files:
    # An empty listing usually means the notebook was started from the wrong
    # working directory; say so instead of printing nothing.
    print("No files found under 'data' - check the working directory.")
      



File: data/SPSS Dataset Baseline Current.sav


In [8]:
# Set file path
SAV_FILE_PATH = "data/SPSS Dataset Baseline Current.sav"

# Create the output folder for plots (a no-op when it already exists)
os.makedirs("plots", exist_ok=True)

# Load the SPSS file; read_sav returns the data frame plus a metadata object
try:
    df, meta = pr.read_sav(SAV_FILE_PATH)
except Exception as e:
    print(f"❌ Error loading SAV file: {e}")
    # Re-raise so the cell stops here with the original traceback. exit() in
    # a notebook acts on the kernel session rather than aborting this cell,
    # and it hides the real cause of the failure.
    raise
else:
    print("✅ SAV file loaded successfully!")
    print(f"📊 Dataset shape: {df.shape}")


# Basic information about the DataFrame
print("\n--- DataFrame Info ---")
df.info()


# First few rows to get a glimpse of the data
print("\n--- First 5 Rows ---")
print(df.head())


✅ SAV file loaded successfully!
📊 Dataset shape: (944, 547)

--- DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 944 entries, 0 to 943
Columns: 547 entries, Country to HOUSEHOLD_SIZE
dtypes: float64(527), object(20)
memory usage: 3.9+ MB

--- First 5 Rows ---
   Country  Studysite  HH_code  B_HH_type  B_HH_count  B_m1_gender  \
0      1.0       12.0  12009.0        1.0         7.0          1.0   
1      1.0       12.0  12008.0        3.0         5.0          2.0   
2      2.0       22.0  22177.0        1.0        14.0          1.0   
3      2.0       22.0  22265.0        1.0         7.0          1.0   
4      2.0       21.0  21226.0        1.0         8.0          1.0   

   B_m1_resp_age  B_m1_relation_head  B_m1_school_years  B_m1_home_occup  ...  \
0           62.0                 1.0                0.0              1.0  ...   
1           60.0                 1.0                0.0              1.0  ...   
2           39.0                 1.0                0.0

In [9]:

# Summarise every column, numeric and non-numeric alike
summary_stats = df.describe(include="all")
print("\n--- Descriptive Statistics ---", summary_stats, sep="\n")


--- Descriptive Statistics ---
           Country   Studysite       HH_code   B_HH_type  B_HH_count  \
count   944.000000  944.000000    944.000000  943.000000  944.000000   
unique         NaN         NaN           NaN         NaN         NaN   
top            NaN         NaN           NaN         NaN         NaN   
freq           NaN         NaN           NaN         NaN         NaN   
mean      1.549788   17.029661  17139.824153    1.193001    5.829449   
std       0.497779    5.041837   5055.312377    0.590866    2.048599   
min       1.000000   11.000000  11001.000000    1.000000    1.000000   
25%       1.000000   12.000000  12014.750000    1.000000    4.000000   
50%       2.000000   21.000000  21047.500000    1.000000    6.000000   
75%       2.000000   22.000000  22053.250000    1.000000    7.000000   
max       2.000000   22.000000  22289.000000    3.000000   15.000000   

        B_m1_gender  B_m1_resp_age  B_m1_relation_head  B_m1_school_years  \
count    944.000000     94