In [None]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from scipy.stats import ttest_ind

# Mount Google Drive at /content/drive/ so the dataset CSV stored under
# "My Drive" can be read by path further down.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime -- confirm before running this script elsewhere.
from google.colab import drive
drive.mount('/content/drive/')

# ----------------------------- HYPOTHESIS 1 ----------------------------------#
# Hypothesis 1: There is a positive correlation between annual precipitation and crop yield in the dataset.

# name of the precipitation column, spelled exactly as in the CSV header
precip_col = 'Current Annual Precipitation (mm) _area_weighted'

# read in joined dataset
df = pd.read_csv("/content/drive/My Drive/Data Science: Food Security/data/faostat_projected.csv")

# drop rows with a missing value in any column, then drop exact duplicate rows
# (keeping the first occurrence); `df` is reused by the later hypothesis sections
df = df.dropna().drop_duplicates()

# Keep only the two columns under test and remove repeated
# (precipitation, yield) PAIRS, the same way the Hypothesis 2 and 3 sections
# de-duplicate their column pairs. De-duplicating each column on its own would
# throw away distinct observations that merely share a precipitation value or
# a yield value with an earlier row, and would keep an arbitrary "first" row.
# NOTE(review): rows that share a precipitation value may not be independent
# observations -- confirm whether aggregating per value is wanted instead.
df_clean = df[[precip_col, 'Yield']].dropna()
df_clean = df_clean.drop_duplicates(subset=[precip_col, 'Yield'])

# calculate Pearson's correlation; pearsonr's default p-value is two-sided
corr, pval = pearsonr(df_clean[precip_col], df_clean['Yield'])

print("--HYPOTHESIS 1--")
# The hypothesis is directional: it is supported only when the coefficient is
# positive AND the p-value is small. A significant negative coefficient
# contradicts it.
# print correlation
print('Pearsons correlation: %.3f' % corr)
# print p-value
print('p-value: %.3f' % pval)

print("\n")

# ----------------------------- HYPOTHESIS 2 ----------------------------------#
# Hypothesis 2: There is a significant difference in crop yield between countries that use irrigation and countries that don't use irrigation.

# Restrict to the Irrigation/Yield pair, discarding rows with a missing value
# and repeated (Irrigation, Yield) combinations.
df_clean_2 = (
    df[['Irrigation', 'Yield']]
    .dropna()
    .drop_duplicates(subset=['Irrigation', 'Yield'])
)

# Partition on the Irrigation flag. Rows whose flag is neither 'Yes' nor 'No'
# end up in neither group.
df_irrigation = df_clean_2.loc[df_clean_2['Irrigation'] == 'Yes']
df_noirrigation = df_clean_2.loc[df_clean_2['Irrigation'] == 'No']

# The two yield samples being compared.
yield_irrigated = df_irrigation['Yield']
yield_not_irrigated = df_noirrigation['Yield']

# Average yield within each group.
mean_irrigation = yield_irrigated.mean()
mean_noirrigation = yield_not_irrigated.mean()

# Report the group means first.
print("--HYPOTHESIS 2--")
print('Mean yield for countries that use irrigation: %.3f' % mean_irrigation)
print('Mean yield for countries that do not use irrigation: %.3f' % mean_noirrigation)

# Independent two-sample t-test on the yield samples, using SciPy's defaults
# (which assume equal variances in both groups).
ttest, pval = ttest_ind(yield_irrigated, yield_not_irrigated)

# Then the test statistic and its p-value.
print('t-test: %.3f' % ttest)
print('p-value: %.3f' % pval)

print("\n")

# ----------------------------- HYPOTHESIS 3 ----------------------------------#
# Hypothesis 3: There is a significant correlation between CO2 ppm and crop yield in the dataset.

# Yield/CO2 pairs only, without missing values or repeated
# (Yield, CO2 ppm) combinations.
df_clean_3 = (
    df[['Yield', 'CO2 ppm']]
    .dropna()
    .drop_duplicates(subset=['Yield', 'CO2 ppm'])
)

# Pearson's r between CO2 concentration and yield, together with its p-value.
co2_ppm = df_clean_3['CO2 ppm']
crop_yield = df_clean_3['Yield']
corr, pval = pearsonr(co2_ppm, crop_yield)

print("--HYPOTHESIS 3--")
# Recorded run: -0.032 --> Weak Negative Correlation
print('Pearsons correlation: %.3f' % corr)
print('p-value: %.3f' % pval)

# Interpretation of the recorded run: the p-value is above 0.05, so the result
# is not statistically significant and we fail to reject the null hypothesis,
# i.e. no statistically significant correlation between CO2 ppm and crop yield
# was found.

print("\n")

Mounted at /content/drive/
--HYPOTHESIS 1--
Pearsons correlation: -0.209
p-value: 0.035


--HYPOTHESIS 2--
Mean yield for countries that use irrigation: 2.356
Mean yield for countries that do not use irrigation: 3.596
t-test: -2.290
p-value: 0.023


--HYPOTHESIS 3--
Pearsons correlation: -0.032
p-value: 0.166


