# Avocado Data Analysis Notebook
ALT-TAB LABS LLP &copy; 2020 All Rights Reserved

In [147]:
# show files in current directory
import os
os.listdir()

['75%.py',
 '80%.ipynb',
 'avocado-submission.csv',
 'avocado-test.csv',
 'avocado-train.csv',
 'avocado_notebook_other_models.ipynb',
 'pipeline test.ipynb',
 'pyctfsglib.py',
 '__pycache__']

In [148]:
# load csvs
import pandas as pd
df = pd.read_csv('avocado-train.csv', index_col='id')

df.head()

Unnamed: 0_level_0,DataBatch,Date,TotalVolume,4046,4225,4770,TotalBags,SmallBags,LargeBags,XLargeBags,type,region,AveragePrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,7,2016-11-06,183542.31,98949.98,22891.61,95.0,61605.72,43571.99,17499.01,534.72,conventional,NewOrleansMobile,1.49
1,24,2017-07-16,224434.92,42951.31,120360.02,131.85,60991.74,53141.81,3621.04,4228.89,conventional,HarrisburgScranton,1.38
2,51,2015-01-04,3846.69,1500.15,938.35,0.0,1408.19,1071.35,336.84,0.0,organic,Atlanta,1.76
3,22,2015-07-26,91825.07,1679.28,45615.48,741.77,43788.54,43788.54,0.0,0.0,conventional,BuffaloRochester,1.39
4,50,2015-01-11,54644.32,1491.88,33759.12,1325.17,18068.15,12165.94,5902.21,0.0,conventional,Pittsburgh,1.54


In [149]:
### Dealing with Missing Values ########################################################
# https://www.kaggle.com/alexisbcook/missing-values
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

def dropColumns(X_train, X_valid):
    # Get names of columns with missing values
    cols_with_missing = [col for col in X_train.columns
                        if X_train[col].isnull().any()]

    # Drop columns in training and validation data
    reduced_X_train = X_train.drop(cols_with_missing, axis=1)
    reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)
    return reduced_X_train, reduced_X_valid


def imputing(X_train, X_valid):
    # Imputation
    from sklearn.impute import SimpleImputer
    my_imputer = SimpleImputer()
    imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
    imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

    # Imputation removed column names; put them back
    imputed_X_train.columns = X_train.columns
    imputed_X_valid.columns = X_valid.columns
    return imputed_X_train, imputed_X_valid

def imputePlus(X_train, X_valid):
    X_train_plus = X_train.copy()
    X_valid_plus = X_valid.copy()

    # Make new columns indicating what will be imputed
    # Get names of columns with missing values
    cols_with_missing = [col for col in X_train.columns
                        if X_train[col].isnull().any()]
    for col in cols_with_missing:
        X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
        X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

    # Imputation
    from sklearn.impute import SimpleImputer
    my_imputer = SimpleImputer()
    imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
    imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))

    # Imputation removed column names; put them back
    imputed_X_train_plus.columns = X_train_plus.columns
    imputed_X_valid_plus.columns = X_valid_plus.columns
    return imputed_X_train_plus, imputed_X_valid_plus

### Dealing with Categorial Variables ##################################################
def getNumericColumns(X):
    return [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]
    
def getCategories(X_train):
    # Get list of categorical variables
    s = (X_train.dtypes == 'object')
    object_cols = list(s[s].index)
    return object_cols

def dropCategories(X_train, X_valid):
    object_cols = getCategories(X_train)
    drop_X_train = X_train.select_dtypes(exclude=['object'])
    drop_X_valid = X_valid.select_dtypes(exclude=['object'])
    return drop_X_train, drop_X_valid, Y_train, Y_valid

# Label Encoding
def labelEncoding(X_train, X_valid):
    object_cols = getCategories(X_train)

    from sklearn.preprocessing import LabelEncoder
    # Make copy to avoid changing original data 
    label_X_train = X_train.copy()
    label_X_valid = X_valid.copy()

    # Apply label encoder to each column with categorical data
    label_encoder = LabelEncoder()
    for col in object_cols:
        label_X_train[col] = label_encoder.fit_transform(X_train[col])
        label_X_valid[col] = label_encoder.transform(X_valid[col])

    return label_X_train, label_X_valid, Y_train, Y_valid

### One Hot Encoding
def oneHotEncoding(X_train, X_valid, debug=True):
    object_cols = getCategories(X_train)

    from sklearn.preprocessing import OneHotEncoder
    # Apply one-hot encoder to each column with categorical data
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
    OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

    # One-hot encoding removed index; put it back
    OH_cols_train.index = X_train.index
    OH_cols_valid.index = X_valid.index

    # Remove categorical columns (will replace with one-hot encoding)
    num_X_train = X_train.drop(object_cols, axis=1)
    num_X_valid = X_valid.drop(object_cols, axis=1)

    # Add one-hot encoded columns to numerical features
    OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
    OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

    return OH_X_train, OH_X_valid

In [171]:
# Convert strings to numbers
c = getNumericColumns(df)
for column in c:
    df[column] = pd.to_numeric(df[column])

# Select data for learning
features = c[:-1]+["type","region"]
X = df[features]
Y = df["AveragePrice"]

# Split training into some for training and some for testing
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
print("training split: ", len(X_train), "; test split: ", len(X_test))

training split:  8558 ; test split:  4216


In [172]:
# View the data!
#df.head()
X_train, X_test = oneHotEncoding(X_train, X_test)
X_train, X_test = imputePlus(X_train, X_test)

In [152]:
df.head()

Unnamed: 0_level_0,DataBatch,Date,TotalVolume,4046,4225,4770,TotalBags,SmallBags,LargeBags,XLargeBags,type,region,AveragePrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,7,2016-11-06,183542.31,98949.98,22891.61,95.0,61605.72,43571.99,17499.01,534.72,conventional,NewOrleansMobile,1.49
1,24,2017-07-16,224434.92,42951.31,120360.02,131.85,60991.74,53141.81,3621.04,4228.89,conventional,HarrisburgScranton,1.38
2,51,2015-01-04,3846.69,1500.15,938.35,0.0,1408.19,1071.35,336.84,0.0,organic,Atlanta,1.76
3,22,2015-07-26,91825.07,1679.28,45615.48,741.77,43788.54,43788.54,0.0,0.0,conventional,BuffaloRochester,1.39
4,50,2015-01-11,54644.32,1491.88,33759.12,1325.17,18068.15,12165.94,5902.21,0.0,conventional,Pittsburgh,1.54


In [153]:
'''
from sklearn.preprocessing import StandardScaler
scX = StandardScaler()
scX.fit(X_train)
X_test = scX.transform(X_test)

# Performng PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=None)
pca.fit(X_train)
X_test = pca.transform(X_test)
explainedvariance = pca.explained_variance_ratio_

# Importing model
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
regressor.score(X_test,Y_test)
'''

'\nfrom sklearn.preprocessing import StandardScaler\nscX = StandardScaler()\nscX.fit(X_train)\nX_test = scX.transform(X_test)\n\n# Performng PCA\nfrom sklearn.decomposition import PCA\npca = PCA(n_components=None)\npca.fit(X_train)\nX_test = pca.transform(X_test)\nexplainedvariance = pca.explained_variance_ratio_\n\n# Importing model\nfrom sklearn.linear_model import LinearRegression\nregressor = LinearRegression()\nregressor.fit(X_train, Y_train)\nregressor.score(X_test,Y_test)\n'

In [154]:
'''
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit only to the training data
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
Y_train = np.asarray(Y_train, dtype="|S6")
Y_test = np.asarray(Y_test, dtype="|S6")

from sklearn.neural_network import MLPClassifier
model = MLPClassifier(hidden_layer_sizes=(30,30,30))
model.fit(X_train,Y_train)
model.score(X_train,Y_train)
'''

'\nfrom sklearn.preprocessing import StandardScaler\nscaler = StandardScaler()\n# Fit only to the training data\nscaler.fit(X_train)\nX_train = scaler.transform(X_train)\nX_test = scaler.transform(X_test)\nY_train = np.asarray(Y_train, dtype="|S6")\nY_test = np.asarray(Y_test, dtype="|S6")\n\nfrom sklearn.neural_network import MLPClassifier\nmodel = MLPClassifier(hidden_layer_sizes=(30,30,30))\nmodel.fit(X_train,Y_train)\nmodel.score(X_train,Y_train)\n'

# Regression Time!

In [155]:
# Sci Kit Learn! Scientific Computing library for python
from sklearn import *

In [173]:
# Pick the regression model we want to use

import sklearn.tree as tree
models = [
    tree.DecisionTreeRegressor(random_state=2020),
    ensemble.RandomForestRegressor(random_state=2020, n_estimators=100)
    #MLPClassifier(hidden_layer_sizes=(len(features),len(features),len(features)))
]


currScore = -10
model = None
for i in models:
    #print(i)
    i.fit(X_train,Y_train)
    score = i.score(X_test,Y_test)
    print(score)
    if score > currScore:
        model = i
        print(model)
        currScore = score

# https://www.kdnuggets.com/2016/10/beginners-guide-neural-networks-python-scikit-learn.html/2



0.6350478505550481
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=2020, splitter='best')
0.8231919608917935
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=2020, verbose=0, warm_start=False)


In [174]:
# Score model
model.score(X_test, Y_test) #TODO TODO TODO TODO TODO TODO TODO TODO

0.8231919608917935

# Predict for Testing data

In [175]:
# Clean Data
MyDataFrame = pd.read_csv('avocado-test.csv', index_col='id')
# Drops missing values 
#MyDataFrame.dropna()
# Convert strings to numbers
# Convert strings to numbers
c = getNumericColumns(df)
for column in c:
    MyDataFrame[column] = pd.to_numeric(df[column])
MyDataFrame.head()

Unnamed: 0_level_0,DataBatch,Date,TotalVolume,4046,4225,4770,TotalBags,SmallBags,LargeBags,XLargeBags,type,region,AveragePrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,7,2016-07-31,183542.31,98949.98,22891.61,95.0,61605.72,43571.99,17499.01,534.72,organic,Chicago,1.49
1,24,2017-08-20,224434.92,42951.31,120360.02,131.85,60991.74,53141.81,3621.04,4228.89,conventional,NorthernNewEngland,1.38
2,51,2017-01-22,3846.69,1500.15,938.35,0.0,1408.19,1071.35,336.84,0.0,organic,GreatLakes,1.76
3,22,2015-10-04,91825.07,1679.28,45615.48,741.77,43788.54,43788.54,0.0,0.0,organic,CincinnatiDayton,1.39
4,50,2015-10-18,54644.32,1491.88,33759.12,1325.17,18068.15,12165.94,5902.21,0.0,organic,Houston,1.54


In [176]:
# Select features
XforPredictions =  MyDataFrame[features]#None #TODO TODO TODO TODO TODO TODO TODO TODO

#my_pipeline.fit(X_train, Y_train)
# Preprocessing of validation data, get predictions


X_train, X_test = oneHotEncoding(X, XforPredictions)
X_train, X_test = imputePlus(X_train ,X_test)


In [177]:
# Make Predictions!
yPredictions =  model.predict(X_test)# model.predict(X_test)# None #TODO TODO TODO TODO TODO TODO TODO TODO
print("Predictions:", yPredictions)

Predictions: [1.876  1.3383 1.6805 ... 1.3981 1.5591 1.5873]


In [178]:
# Save to CSV File!
XforPredictions =  MyDataFrame[features]
output = pd.DataFrame({'id': XforPredictions.index, 'AveragePrice': yPredictions})
output.to_csv('avocado-submission.csv', index=False)
print(output)

id  AveragePrice
0        0        1.8760
1        1        1.3383
2        2        1.6805
3        3        1.8574
4        4        1.5926
...    ...           ...
5470  5470        1.7320
5471  5471        1.4282
5472  5472        1.3981
5473  5473        1.5591
5474  5474        1.5873

[5475 rows x 2 columns]


# Upload for grading

In [168]:
# Download CTFSG Grader Libraries
import urllib.request, os
urllib.request.urlretrieve('https://raw.githubusercontent.com/alttablabs/ctfsg-utils/master/pyctfsglib.py', './pyctfsglib.py')
print('Downloaded pyctfsglib.py:', 'pyctfsglib.py' in os.listdir())

Downloaded pyctfsglib.py: True


In [169]:
# Connect to graders
import pyctfsglib as ctfsg
import random

USER_TOKEN = "MXhtGfdjdsUfiEKTHHEuVGohZESBdMiHrFkmYqNqIFfcWOHGvcubvHJvnxpAqRMh" # You need to fill this up
GRADER_URL = random.choice([
"http://challenges.csdc20t.ctf.sg:30001/", "http://challenges.csdc20t.ctf.sg:30002/"
])
grader = ctfsg.DSGraderClient(GRADER_URL, USER_TOKEN)

DSGraderClient: Successfully Connected!
[SERVER] MOTD: CHECK your USER_TOKEN and GRADER_URL HTTP address! I'm AVOCADO_PRICE TEST_GRADER_1


In [179]:
grader.submitFile('avocado-submission.csv')

ProofOfWork Challenge =>  ('CTFSGRB4964c7954faccdc6388e9a04b2be6c02', 22)
ProofOfWork Answer Found! =>  1459974


'{"challenge":{"name":"Avocado Prices"},"id":"ck8mt5qvg6v420868r669ol2k","status":"PARTIALLY_CORRECT","multiplier":0.1204,"submittedBy":{"username":"nyjc-1"},"createdAt":"2020-04-05T08:52:53Z"}'