In [None]:
import warnings, requests, zipfile, io 
warnings.simplefilter('ignore')
import pandas as pd
import numpy as np
import os
import boto3
import sagemaker
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sagemaker.image_uris import retrieve

In [None]:
# Location of the downloaded archive and the directory it is unpacked into.
zip_file_path = 'food-twentieth-century-crop-statistics-1900-2017-xlsx.zip'  # Replace with the actual path to your ZIP file
extraction_path = './data'  # Specify the directory where you want to extract the contents

# Extract the ZIP file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extraction_path)

# List the files in the extraction directory to see the extracted files.
# os.listdir() returns entries in arbitrary order, so sort for a stable listing.
extracted_files = sorted(os.listdir(extraction_path))
print("Extracted files:", extracted_files)

# Read the first Excel workbook in the extraction directory. Filtering by
# extension avoids handing pd.read_excel whatever non-Excel entry happens to be
# listed first (other files or sub-directories already present in ./data).
excel_files = [name for name in extracted_files if name.lower().endswith(('.xlsx', '.xls'))]
if not excel_files:
    raise FileNotFoundError(
        "No Excel workbook found in {!r}; directory contains: {}".format(extraction_path, extracted_files)
    )
excel_file_path = os.path.join(extraction_path, excel_files[0])
df = pd.read_excel(excel_file_path)

# Now, 'df' contains the first sheet of that workbook as a DataFrame.

In [None]:
# Load the Excel file. The context manager closes the underlying file handle
# once the sheet has been parsed (a bare pd.ExcelFile(...) leaves it open).
with pd.ExcelFile("data/food-twentieth-century-crop-statistics-1900-2017-xlsx.xlsx") as file:
    # Read data from a specific sheet (e.g., 'CropStats')
    sheet_name = 'CropStats'  # Replace with the name of the sheet you want to load
    df = file.parse(sheet_name)

# Set the index to the first column and remove the index name
df = df.set_index(df.columns[0])
df.index.name = None

# Now, 'df' contains the data from the specified sheet with the index set as described.


In [None]:
# Preview the first five rows of the freshly loaded sheet.
df.head(n=5)

In [None]:
# Cleaning
# Optional column drop, currently disabled:
#df.drop(['admin2', 'notes', 'Harvest_year'], axis=1, inplace=True)

# Give the raw columns identifier-friendly names.
df.rename(
    columns={
        'admin0': 'national',
        'admin1': 'subnational',
        'hectares (ha)': 'hectares_ha',
        'production (tonnes)': 'production_tonnes',
        'yield(tonnes/ha)': 'yield_tonnes_ha',
    },
    inplace=True,
)

# Rows without a subnational region fall back to the national name.
df.loc[df['subnational'].isna(), 'subnational'] = df['national']

# The three measures are tied together by
#     yield_tonnes_ha = production_tonnes / hectares_ha
# so a single missing measure can be recovered from the other two. After each
# fill, rows where that measure is still missing are dropped.

# 1) Missing yield = production / hectares.
# The zero check must be parenthesised: `&` binds tighter than `!=`, so
# `a & b & hectares != 0` is evaluated as `(a & b & hectares) != 0`.
mask = (
    df['yield_tonnes_ha'].isna()
    & df['production_tonnes'].notna()
    & df['hectares_ha'].notna()
    & (df['hectares_ha'] != 0)
)
# Compute only on the selected rows so rows with zero hectares are never divided.
df.loc[mask, 'yield_tonnes_ha'] = df.loc[mask, 'production_tonnes'] / df.loc[mask, 'hectares_ha']
df.dropna(subset=['yield_tonnes_ha'], inplace=True)

# 2) Missing production = yield * hectares.
mask = df['production_tonnes'].isna() & df['yield_tonnes_ha'].notna() & df['hectares_ha'].notna()
df.loc[mask, 'production_tonnes'] = df.loc[mask, 'yield_tonnes_ha'] * df.loc[mask, 'hectares_ha']
df.dropna(subset=['production_tonnes'], inplace=True)

# 3) Missing hectares = production / yield (not yield * production, which
#    would have units of tonnes^2/ha). Zero yields cannot be inverted and are
#    left missing, so those rows are dropped below.
mask = (
    df['hectares_ha'].isna()
    & df['yield_tonnes_ha'].notna()
    & df['production_tonnes'].notna()
    & (df['yield_tonnes_ha'] != 0)
)
df.loc[mask, 'hectares_ha'] = df.loc[mask, 'production_tonnes'] / df.loc[mask, 'yield_tonnes_ha']
df.dropna(subset=['hectares_ha'], inplace=True)

# The columns we just adapted may have become object dtype; make them floats again.
for column in ('hectares_ha', 'production_tonnes', 'yield_tonnes_ha'):
    df[column] = df[column].astype(float)

# log(1 + x) versions of the three measures.
df['log_yield'] = np.log1p(df['yield_tonnes_ha'])
df['log_hectares'] = np.log1p(df['hectares_ha'])
df['log_production'] = np.log1p(df['production_tonnes'])

In [None]:
# Data sample
# Only the last expression of a cell is rendered, so a bare `df.shape` above
# `df.head(20)` is silently discarded; print it explicitly.
print(df.shape)
df.head(20)

In [None]:
# Split the cleaned frame: 80% for training, the remaining 20% is held back
# to be divided into test and validation sets afterwards.
train, test_and_validate = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)
# Stratification is disabled (was: stratify=df['yield_tonnes_ha']).

In [None]:
# Divide the held-back rows evenly into a test set and a validation set.
test, validate = train_test_split(
    test_and_validate,
    random_state=42,
    test_size=0.5,
)
# Stratification is disabled (was: stratify=df['yield(tonnes/ha)']).

In [None]:
# Report the size of each split.
for split in (train, test, validate):
    print(split.shape)

# Count how often each yield value occurs in every split and line the three
# count columns up side by side (train, test, validate).
t1, t2, t3 = (part['yield_tonnes_ha'].value_counts() for part in (train, test, validate))
result = pd.concat([t1, t2, t3], axis=1, sort=False)
result

In [None]:
# Upload to S3: destination bucket, key prefix and object names for the splits.
bucket = 'c93435a2086654l5105130t1w6478590828-sandboxbucket-qute15kbvwmy'
prefix = 'mod03-demo-training-a-model'

train_file, test_file, validate_file = 'crop_train.csv', 'crop_test.csv', 'crop_validate.csv'
whole_file = 'crop.csv'

# S3 resource handle created from the default boto3 session.
s3_resource = boto3.Session().resource('s3')

def upload_s3_csv(filename, folder, dataframe, s3=None, bucket_name=None, key_prefix=None):
    """Serialise ``dataframe`` to CSV in memory and store it as an S3 object.

    The CSV is written without a header row and without the index, with the
    columns in the DataFrame's own order. The object key is
    ``<key_prefix>/<folder>/<filename>``.

    NOTE(review): SageMaker's built-in XGBoost expects header-less CSV with the
    label in the first column and numeric features only; this function does not
    reorder or filter columns, so the caller has to pass a suitable frame.

    Args:
        filename: Object name, e.g. ``'crop_train.csv'``.
        folder: Sub-folder below the key prefix, e.g. ``'train'``.
        dataframe: pandas DataFrame to upload.
        s3: Optional S3 resource; defaults to the notebook-level ``s3_resource``.
        bucket_name: Optional bucket; defaults to the notebook-level ``bucket``.
        key_prefix: Optional key prefix; defaults to the notebook-level ``prefix``.

    Returns:
        None.
    """
    # S3 keys always use '/' as separator; os.path.join would insert the host
    # OS separator (a backslash on Windows).
    import posixpath

    target_s3 = s3_resource if s3 is None else s3
    target_bucket = bucket if bucket_name is None else bucket_name
    target_prefix = prefix if key_prefix is None else key_prefix

    csv_buffer = io.StringIO()
    dataframe.to_csv(csv_buffer, header=False, index=False)
    object_key = posixpath.join(target_prefix, folder, filename)
    target_s3.Bucket(target_bucket).Object(object_key).put(Body=csv_buffer.getvalue())

# Upload each split to its own folder under the prefix (train, test, validate).
for split_file, split_folder, split_frame in (
    (train_file, 'train', train),
    (test_file, 'test', test),
    (validate_file, 'validate', validate),
):
    upload_s3_csv(split_file, split_folder, split_frame)

In [None]:
# Execution role for the training job.
role = sagemaker.get_execution_role()

# Training output goes under <prefix>/output/ in the same bucket.
s3_output_location = "s3://{}/{}/output/".format(bucket, prefix)

# XGBoost 1.0-1 container image for the session's region.
container = retrieve(
    framework='xgboost',
    region=boto3.Session().region_name,
    version='1.0-1',
)

In [None]:
# XGBoost hyperparameters: 40 boosting rounds, 7-class softmax objective.
hyperparams = dict(
    num_round="40",
    num_class="7",
    objective="multi:softmax",
)

In [None]:
# Estimator for the XGBoost container: one ml.m4.xlarge instance, artefacts
# written to s3_output_location, hyperparameters taken from `hyperparams`.
xgb_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=s3_output_location,
    hyperparameters=hyperparams,
    sagemaker_session=sagemaker.Session(),
)

In [None]:
# Training inputs point at the S3 *prefixes* the splits were uploaded to.
# The templates have only two placeholders, so the file name that used to be
# passed as a third format argument was silently ignored; it has been removed
# to make clear that the channel is the whole folder, not a single object.
train_channel = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/train/".format(bucket, prefix),
    content_type='text/csv')

validate_channel = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/validate/".format(bucket, prefix),
    content_type='text/csv')

# Channel names passed to the estimator's fit() call.
data_channels = {'train': train_channel, 'validation': validate_channel}

In [None]:
# Start training on the train/validation channels with logs=False.
xgb_model.fit(data_channels, logs=False)

In [None]:
# Fetch the error metrics recorded for the most recent training job.
# Uses the public `latest_training_job.name` instead of the private
# `_current_job_name` attribute of the estimator.
s = sagemaker.analytics.TrainingJobAnalytics(
    xgb_model.latest_training_job.name,
    metric_names=['train:merror', 'validation:merror'],
)

s_df = s.dataframe()
# Keep the columns at positions 1 and 2, dropping the first column.
s_df = s_df.iloc[:, 1:3]
s_df
# merror = (wrong cases) / (all cases)

<font color='yellow'>This model returns 0 twice.</font>

In [None]:
# Fetch the error metrics recorded for the most recent training job.
# Uses the public `latest_training_job.name` instead of the private
# `_current_job_name` attribute of the estimator.
s = sagemaker.analytics.TrainingJobAnalytics(
    xgb_model.latest_training_job.name,
    metric_names=['train:merror', 'validation:merror'],
)

s_df = s.dataframe()
# Keep every column from position 1 onwards (up to 11 columns); a slice end
# past the last column is clipped by iloc.
s_df = s_df.iloc[:, 1:12]
s_df
# merror = (wrong cases) / (all cases)

<font color='yellow'>This model returns 0 twice.</font>

In [1]:
# XGBoost hyperparameters: 40 boosting rounds, 11-class softmax objective.
hyperparams = dict(
    num_round="40",
    num_class="11",
    objective="multi:softmax",
)

In [None]:
# Estimator for the 11-class run.
# NOTE(review): the number of classes is set through the `num_class`
# hyperparameter; `instance_count` is the number of training machines to
# launch. The previous value of 11 mirrored `num_class` and looks like a
# copy/paste slip, so it is restored to the single instance used by the
# first run. Raise it again only if distributed training is really intended.
xgb_model=sagemaker.estimator.Estimator(container,
                                        role,
                                        instance_count=1,
                                        instance_type='ml.m4.xlarge',
                                        output_path=s3_output_location,
                                        hyperparameters=hyperparams,
                                        sagemaker_session=sagemaker.Session())

<font color='yellow'>This model returns 0 six times.</font>