**Loading Data**

In [1]:
import pandas as pd

# Read the two administrative levels of the Karamoja crop-yield dataset:
# one row per district, and one row per subcounty.
district_data = pd.read_csv(
    '/content/Uganda_Karamoja_District_Crop_Yield_Population.csv'
)
subcounty_data = pd.read_csv(
    '/content/Uganda_Karamoja_Subcounty_Crop_Yield_Population.csv'
)

# Preview the top of each table under its own heading
# (sep="\n" puts the heading and the frame on separate lines).
print("District Data:", district_data.head(), sep="\n")
print("\nSubcounty Data:", subcounty_data.head(), sep="\n")


District Data:
   OBJECTID     NAME     POP        Area  S_Yield_Ha  M_Yield_Ha  \
0        92     ABIM   90385  2771977106         449        1040   
1        96   AMUDAT  101790  1643582836         205        1297   
2        20  KAABONG  627057  7373606003         279         945   
3        85   KOTIDO  243157  3641539808         331        1148   
4         5   MOROTO  127811  3570160948         128         355   

   Crop_Area_Ha     S_Area_Ha    M_Area_Ha  S_Prod_Tot  M_Prod_Tot  
0   5470.068394   3277.295971  1848.621855     1471506     1922567  
1   5765.443719   2973.423860  2733.661014      609552     3545558  
2  28121.672530  20544.194960  7394.416334     5731830     6987723  
3  53032.649450  50247.443900  1751.372284    16631904     2010575  
4   5954.814048   4741.748776  1190.050606      606944      422468  

Subcounty Data:
   OBJECTID       SUBCOUNTY_NAME DISTRICT_NAME    POP        Area Karamoja  \
0       263              KACHERI        KOTIDO  17244  1067176155  

**Data Inspection**

In [2]:
# Per-column count of null entries in each table
# (heading and counts are emitted on separate lines via sep="\n").
print("Missing values in district data:", district_data.isna().sum(), sep="\n")
print("\nMissing values in subcounty data:", subcounty_data.isna().sum(), sep="\n")

# Number of rows that exactly repeat an earlier row in the same table
district_dupes = district_data.duplicated().sum()
subcounty_dupes = subcounty_data.duplicated().sum()
print("\nDuplicates in district data:", district_dupes)
print("Duplicates in subcounty data:", subcounty_dupes)


Missing values in district data:
OBJECTID        0
NAME            0
POP             0
Area            0
S_Yield_Ha      0
M_Yield_Ha      0
Crop_Area_Ha    0
S_Area_Ha       0
M_Area_Ha       0
S_Prod_Tot      0
M_Prod_Tot      0
dtype: int64

Missing values in subcounty data:
OBJECTID          0
SUBCOUNTY_NAME    0
DISTRICT_NAME     0
POP               0
Area              0
Karamoja          0
S_Yield_Ha        0
M_Yield_Ha        0
Crop_Area_Ha      0
S_Area_Ha         0
M_Area_Ha         0
S_Prod_Tot        0
M_Prod_Tot        0
dtype: int64

Duplicates in district data: 0
Duplicates in subcounty data: 0


**Data Cleaning**

In [3]:
# Convert the 'Karamoja' column to boolean: True where the raw value is 'Y',
# False for anything else (including missing values).
#
# The dtype guard makes this cell safe to re-run. Without it, a second run
# would compare the already-converted booleans against 'Y' and silently
# flip every row to False.
if not pd.api.types.is_bool_dtype(subcounty_data['Karamoja']):
    # Vectorized comparison; same result as a row-wise `x == 'Y'` lambda.
    subcounty_data['Karamoja'] = subcounty_data['Karamoja'].eq('Y')

# Check the changes to ensure the conversion was successful
print(subcounty_data[['SUBCOUNTY_NAME', 'DISTRICT_NAME', 'Karamoja']].head())


        SUBCOUNTY_NAME DISTRICT_NAME  Karamoja
0              KACHERI        KOTIDO      True
1               KOTIDO        KOTIDO      True
2  KOTIDO TOWN COUNCIL        KOTIDO      True
3         NAKAPERIMORU        KOTIDO      True
4           PANYANGARA        KOTIDO      True


**Data Merging**

In [4]:
# Attach district-level columns to every subcounty row by matching the
# subcounty's 'DISTRICT_NAME' against the district table's 'NAME'.
# Columns present in both tables get a '_subcounty' / '_district' suffix.
merged_data = subcounty_data.merge(
    district_data,
    left_on='DISTRICT_NAME',
    right_on='NAME',
    suffixes=('_subcounty', '_district'),
)

# Preview the combined table
print("Merged Data:", merged_data.head(), sep="\n")


Merged Data:
   OBJECTID_subcounty       SUBCOUNTY_NAME DISTRICT_NAME  POP_subcounty  \
0                 263              KACHERI        KOTIDO          17244   
1                 264               KOTIDO        KOTIDO          52771   
2                 265  KOTIDO TOWN COUNCIL        KOTIDO          27389   
3                 266         NAKAPERIMORU        KOTIDO          38775   
4                 267           PANYANGARA        KOTIDO          65704   

   Area_subcounty  Karamoja  S_Yield_Ha_subcounty  M_Yield_Ha_subcounty  \
0      1067176155      True            354.207411           1137.467019   
1       597575188      True            367.890523           1162.996687   
2        23972401      True            369.314177           1167.005832   
3       419111591      True            283.324569            852.366578   
4       880955930      True            373.836926           1283.859882   

   Crop_Area_Ha_subcounty  S_Area_Ha_subcounty  ...    NAME  POP_district  \
0       

**Remove Redundant Columns**

In [5]:
# Drop the district table's 'NAME' join key from the merged frame.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this
# cell re-runnable: a second run is a no-op rather than a KeyError on the
# already-removed column.
merged_data = merged_data.drop(columns=['NAME'], errors='ignore')


**Saving the Data**

In [6]:
# Write the merged table to a CSV in the current working directory,
# leaving out the DataFrame's row index.
merged_data.to_csv(
    path_or_buf='Merged_Karamoja_Crop_Yield_Population.csv',
    index=False,
)


In [7]:
from google.colab import files

# Hand the merged CSV written by the previous cell to google.colab's download helper.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime,
# so this cell is expected to fail elsewhere — confirm before running locally.
files.download('Merged_Karamoja_Crop_Yield_Population.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>