In [None]:
import pandas as pd

# Read the four source tables straight from the project's GitHub repository.
# Each year column is cast to int as part of the load so later merges compare
# like with like. Note the column is spelled 'Year' in the crop and price
# tables but 'YEAR' in the two weather tables.
icrisat_data = pd.read_csv(
    'https://raw.githubusercontent.com/Cropdata5320/CropData_Visualizations/main/ICRISAT_cleaned.csv'
).astype({'Year': int})

food_prices_data = pd.read_csv(
    'https://raw.githubusercontent.com/Cropdata5320/CropData_Visualizations/main/Food_Prices_cleaned.csv'
).astype({'Year': int})

temperature_data = pd.read_csv(
    'https://raw.githubusercontent.com/Cropdata5320/CropData_Visualizations/main/temperature.csv'
).astype({'YEAR': int})

rainfall_data = pd.read_csv(
    'https://raw.githubusercontent.com/Cropdata5320/CropData_Visualizations/main/rainfall.csv'
).astype({'YEAR': int})



In [None]:
# Reshape the ICRISAT table from wide (one column per crop yield) to long
# (one row per district, year and yield column).
id_columns = ['Dist Code', 'Year', 'State Code', 'State Name', 'Dist Name']
yield_columns = [name for name in icrisat_data.columns if 'YIELD' in name]

yield_data_long = pd.melt(
    icrisat_data,
    id_vars=id_columns,
    value_vars=yield_columns,
    var_name='commodity_yield',
    value_name='yield',
)

# The commodity label is the first space-separated word of the original
# yield column name.
yield_data_long['commodity'] = yield_data_long['commodity_yield'].str.split(' ').str[0]

# Mean yield over all districts for every (state, year, commodity) combination.
average_yield_by_commodity_year = (
    yield_data_long
    .groupby(['State Name', 'Year', 'commodity'], as_index=False)['yield']
    .mean()
)

average_yield_by_commodity_year

Unnamed: 0,State Name,Year,commodity,yield
0,Andhra Pradesh,1966,BARLEY,0.000000
1,Andhra Pradesh,1966,CASTOR,104.343636
2,Andhra Pradesh,1966,CHICKPEA,265.150909
3,Andhra Pradesh,1966,COTTON,37.572727
4,Andhra Pradesh,1966,FINGER,915.929091
...,...,...,...,...
23869,West Bengal,2017,SORGHUM,33.333333
23870,West Bengal,2017,SOYABEAN,90.000000
23871,West Bengal,2017,SUGARCANE,9217.251333
23872,West Bengal,2017,SUNFLOWER,1128.241333


In [None]:
# Mean of the ANNUAL temperature column for each year.
average_temp_by_year = temperature_data.groupby('YEAR')['ANNUAL'].mean().reset_index()

# Sum of the ANN rainfall column for each year.
total_rainfall_by_year = rainfall_data.groupby('YEAR')['ANN'].sum().reset_index()

# Prices may arrive as text; anything that cannot be parsed becomes NaN and is
# then skipped by the means below.
food_prices_data['price'] = pd.to_numeric(food_prices_data['price'], errors='coerce')

# Step 1: average price per year for each commodity label as it appears in the
# source data.
average_prices_by_year = food_prices_data.groupby(['Year', 'commodity'])['price'].mean().reset_index()

# Step 2: reduce each label to its first word in upper case so it can be
# matched against the commodity names derived from the yield columns.
average_prices_by_year['commodity'] = (
    average_prices_by_year['commodity'].str.split().str[0].str.upper()
)

# Step 3: different source labels can share the same first word, which would
# leave several rows for one (Year, commodity) key and duplicate rows in any
# merge on that key. Collapse them so each key occurs exactly once; every
# source label contributes equally to the resulting mean.
average_prices_by_year = (
    average_prices_by_year
    .groupby(['Year', 'commodity'], as_index=False)['price']
    .mean()
)

average_prices_by_year

Unnamed: 0,Year,commodity,price
0,1994,OIL,35.930894
1,1994,RICE,7.850000
2,1994,SUGAR,14.388672
3,1994,WHEAT,5.989109
4,1995,RICE,8.119643
...,...,...,...
353,2023,SUGAR,50.876776
354,2023,TEA,277.091033
355,2023,TOMATOES,37.161306
356,2023,WHEAT,32.694242


In [None]:
# Attach the yearly weather aggregates to every (state, year, commodity) yield
# row. Both weather tables are keyed by year only, so all rows of a given year
# receive the same temperature and rainfall values.
weather_column_names = {
    'ANNUAL': 'Avg_Annual_Temperature',
    'ANN': 'Avg_Annual_Rainfall',
}

combined_data_with_commodity = (
    average_yield_by_commodity_year
    # The yield table spells the key 'Year', the weather tables 'YEAR'.
    .merge(average_temp_by_year, how='left', left_on='Year', right_on='YEAR')
    # After the first merge the frame carries 'YEAR', so it can be reused here.
    .merge(total_rainfall_by_year, how='left', on='YEAR')
    # 'YEAR' was only needed as a join key; 'Year' stays.
    .drop(columns=['YEAR'])
    .rename(columns=weather_column_names)
)

combined_data_with_commodity

Unnamed: 0,State Name,Year,commodity,yield,Avg_Annual_Temperature,Avg_Annual_Rainfall
0,Andhra Pradesh,1966,BARLEY,0.000000,24.36,1058.0
1,Andhra Pradesh,1966,CASTOR,104.343636,24.36,1058.0
2,Andhra Pradesh,1966,CHICKPEA,265.150909,24.36,1058.0
3,Andhra Pradesh,1966,COTTON,37.572727,24.36,1058.0
4,Andhra Pradesh,1966,FINGER,915.929091,24.36,1058.0
...,...,...,...,...,...,...
23869,West Bengal,2017,SORGHUM,33.333333,24.79,1211.0
23870,West Bengal,2017,SOYABEAN,90.000000,24.79,1211.0
23871,West Bengal,2017,SUGARCANE,9217.251333,24.79,1211.0
23872,West Bengal,2017,SUNFLOWER,1128.241333,24.79,1211.0


In [None]:
# Attach the yearly average price to every (state, year, commodity) row.

# Remove price columns left behind by an earlier run of this cell; otherwise a
# re-run would produce 'price_x' / 'price_y' and the lookup of 'price' below
# would fail.
yield_weather_rows = combined_data_with_commodity.drop(
    columns=['price', 'average_price'], errors='ignore'
)

# The price table can contain more than one row per (Year, commodity), because
# commodity labels are shortened to their first word after prices are averaged.
# Merging on a non-unique key would duplicate yield rows, so reduce the prices
# to one mean per key first.
prices_per_commodity_year = (
    average_prices_by_year
    .groupby(['Year', 'commodity'], as_index=False)['price']
    .mean()
)

combined_data_with_commodity = pd.merge(
    yield_weather_rows,
    prices_per_commodity_year,
    on=['Year', 'commodity'],
    how='left',
    validate='many_to_one',  # fail loudly if the price keys are ever non-unique again
)

# Keep the separate 'average_price' column that later cells rely on.
combined_data_with_commodity['average_price'] = (
    combined_data_with_commodity
    .groupby(['State Name', 'Year', 'commodity'])['price']
    .transform('mean')
)

combined_data_with_commodity

Unnamed: 0,State Name,Year,commodity,yield,Avg_Annual_Temperature,Avg_Annual_Rainfall,price,average_price
0,Andhra Pradesh,1966,BARLEY,0.000000,24.36,1058.0,,
1,Andhra Pradesh,1966,CASTOR,104.343636,24.36,1058.0,,
2,Andhra Pradesh,1966,CHICKPEA,265.150909,24.36,1058.0,,
3,Andhra Pradesh,1966,COTTON,37.572727,24.36,1058.0,,
4,Andhra Pradesh,1966,FINGER,915.929091,24.36,1058.0,,
...,...,...,...,...,...,...,...,...
24029,West Bengal,2017,SOYABEAN,90.000000,24.79,1211.0,,
24030,West Bengal,2017,SUGARCANE,9217.251333,24.79,1211.0,,
24031,West Bengal,2017,SUNFLOWER,1128.241333,24.79,1211.0,,
24032,West Bengal,2017,WHEAT,2528.984667,24.79,1211.0,190.711391,108.781435


In [None]:
# Count the missing values in each column of the combined table.
combined_data_with_commodity.isna().sum()

State Name                    0
Year                          0
commodity                     0
yield                         0
Avg_Annual_Temperature        0
Avg_Annual_Rainfall           0
price                     22918
average_price             22918
dtype: int64

In [None]:
# Keep only rows that have a value in every column, i.e. the (state, year,
# commodity) combinations for which a price is available.
combined_data_with_commodity = (
    combined_data_with_commodity
    .dropna()
    # 'average_price' supersedes 'price'; errors='ignore' keeps this cell
    # re-runnable once the column is already gone.
    .drop(columns=['price'], errors='ignore')
    # Rows that differed only in 'price' are identical now; keep one of each so
    # no (state, year, commodity) combination is counted twice downstream.
    .drop_duplicates()
)

# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
combined_data_with_commodity.info()
combined_data_with_commodity

<class 'pandas.core.frame.DataFrame'>
Index: 1116 entries, 659 to 24033
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   State Name              1116 non-null   object 
 1   Year                    1116 non-null   int64  
 2   commodity               1116 non-null   object 
 3   yield                   1116 non-null   float64
 4   Avg_Annual_Temperature  1116 non-null   float64
 5   Avg_Annual_Rainfall     1116 non-null   float64
 6   average_price           1116 non-null   float64
dtypes: float64(4), int64(1), object(2)
memory usage: 69.8+ KB
None


Unnamed: 0,State Name,Year,commodity,yield,Avg_Annual_Temperature,Avg_Annual_Rainfall,average_price
659,Andhra Pradesh,1994,RICE,2408.201818,24.46,1295.3,7.850000
666,Andhra Pradesh,1994,WHEAT,0.000000,24.46,1295.3,5.989109
682,Andhra Pradesh,1995,RICE,2443.793636,25.29,1242.4,8.119643
689,Andhra Pradesh,1995,WHEAT,0.000000,25.29,1242.4,5.947260
705,Andhra Pradesh,1996,RICE,2547.888182,24.55,1182.9,8.976590
...,...,...,...,...,...,...,...
24008,West Bengal,2016,WHEAT,2369.040667,25.15,1198.8,94.719927
24009,West Bengal,2016,WHEAT,2369.040667,25.15,1198.8,94.719927
24025,West Bengal,2017,RICE,2880.750000,24.79,1211.0,225.744603
24032,West Bengal,2017,WHEAT,2528.984667,24.79,1211.0,108.781435


In [None]:
# Write the combined table to a CSV file in the working directory, without the
# row index.
combined_csv_path = 'CropYield_Weather_Prices_combined.csv'
combined_data_with_commodity.to_csv(combined_csv_path, index=False)

In [None]:
import altair as alt

def _state_boxplot(data, field, axis_title, chart_title):
    """Build a box plot of one quantitative column, with one box per state.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing a 'State Name' column and the column named by `field`.
    field : str
        Name of the quantitative column drawn on the y axis.
    axis_title : str
        Label for the y axis.
    chart_title : str
        Title shown above the chart.

    Returns
    -------
    alt.Chart
        The configured box plot; it is not displayed by this function.
    """
    return alt.Chart(data).mark_boxplot().encode(
        x=alt.X('State Name:N', title='Region'),
        y=alt.Y(f'{field}:Q', title=axis_title),
        color=alt.Color('State Name:N', scale=alt.Scale(scheme='accent')),
        tooltip=['State Name:N', f'{field}:Q'],
    ).properties(
        title=chart_title
    )


# NOTE(review): temperature and rainfall were joined to the yield rows by year
# only, so the per-state boxes for those two measures can differ only through
# which years each state has rows for - confirm this is the intended reading.

# Box plot of average annual temperature by state
temp_boxplot = _state_boxplot(
    combined_data_with_commodity,
    'Avg_Annual_Temperature',
    'Average Annual Temperature (°C)',
    'Distribution of Average Annual Temperature by Region',
)

# Box plot of total annual rainfall by state
rainfall_boxplot = _state_boxplot(
    combined_data_with_commodity,
    'Avg_Annual_Rainfall',
    'Total Annual Rainfall (mm)',
    'Distribution of Total Annual Rainfall by Region',
)

# Box plot of crop yield by state
yield_boxplot = _state_boxplot(
    combined_data_with_commodity,
    'yield',
    'Crop Yield (kg/ha)',
    'Distribution of Crop Yield by Region',
)

# Place the three charts side by side. The colour scale is shared so a state
# has the same colour in every panel; each panel keeps its own y scale because
# the three measures use different units.
combined_charts = alt.hconcat(
    temp_boxplot,
    rainfall_boxplot,
    yield_boxplot,
    spacing=30,
    title="Distribution of Climatic Conditions & Crop Yield in different Regions",
).resolve_scale(color='shared', y='independent')

combined_charts.display()