In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
from scipy import stats

#Purpose: to find the differences in performance between data manipulation types
#Final product: to get a dataframe that will create a proportional stacked bar chart by fertilizer type

df = pd.read_csv('../data/FertilizersProduct.csv')
df.head()

final_times = dict()
iterations = 50
divisor = 100

store_time = np.empty(shape=iterations, dtype='float64')
time_std = np.empty(shape=6, dtype='float64')

In [2]:
#Vectorization - uses built in math ops or converting between wide/long or use groupby/filter/transform...
#Does not work well with incomplete datasets (with nan)

for i in range(iterations):
    dfpivot = df.copy(deep=False)

    start = time.time()

    dfpivot = dfpivot[dfpivot['Unit']=='tonnes'].drop(columns=['Unit'])
    dfpivot = dfpivot.groupby(['Area','Item'], as_index=False)['Value'].sum()
    dfpivot = dfpivot.pivot(index='Area', columns='Item', values='Value')

    dfpivot = dfpivot.fillna(0.0)

    store_time[i] = time.time()-start

print("Vectorization took: ",np.mean(store_time)," seconds")

final_times['Vect'] = np.mean(store_time)
time_std[0] = np.std(store_time)

dfpivot.head()

Vectorization took:  0.14615270614624024  seconds


Item,"Ammonia, anhydrous",Ammonium nitrate (AN),Ammonium sulphate,Calcium ammonium nitrate (CAN) and other mixtures with calcium carbonate,Diammonium phosphate (DAP),Fertilizers n.e.c.,Monoammonium phosphate (MAP),NPK fertilizers,Other NK compounds,Other NP compounds,...,PK compounds,Phosphate rock,Potassium chloride (muriate of potash) (MOP),Potassium nitrate,Potassium sulphate (sulphate of potash) (SOP),Sodium nitrate,Superphosphates above 35%,"Superphosphates, other",Urea,Urea and ammonium nitrate solutions (UAN)
Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,8.56,3244.57,28.34,20.03,67393.32,42464.99,3444.53,105892.68,0.0,163301.41,...,0.26,14176.45,543.43,0.07,23.9,17.85,13448.58,0.0,1616890.03,5.25
Albania,1120.54,1001720.59,1953.95,16113.21,532561.01,18351.73,1245.36,130420.0,0.0,31440.98,...,4094.71,6281.35,2740.46,418.36,3681.6,6431.68,569761.33,38753.0,967101.33,104.8
Algeria,9993103.16,2117203.14,199644.67,167570.44,58572.51,92574.98,257234.52,2933324.71,0.0,146740.93,...,57253.36,26503675.84,106605.88,71239.21,692694.37,3745.73,869833.8,131825.0,11192522.82,3587527.8
Angola,493.25,133245.73,270978.84,944.01,3925.1,81258.44,1915.74,599950.71,0.0,10150.54,...,58616.51,1446.86,97073.91,1843.15,3097.75,633.98,6881.57,650.44,148596.06,2173.38
Antigua and Barbuda,44.96,369.22,25.51,0.05,82.76,1856.54,0.02,4000.68,0.0,103.18,...,329.88,5.03,0.43,1.2,25.32,0.0,0.01,1.0,42.74,0.01


In [3]:
#Apply - works on columns/rows to apply mathematical functions
#Must be a uniform application

for i in range(iterations):
    dfapply = df.copy(deep=False)

    start = time.time()


    dfapply = dfapply[['Area','Item','Unit','Value']]
    dfapply = dfapply[dfapply['Unit'].apply(lambda x: x=='tonnes')]
    dfapply.drop(columns=['Unit'], inplace=True)
    dfapply = dfapply.groupby(['Area','Item'], as_index=False)['Value'].sum()
    dfapply = dfapply.pivot(index='Area', columns='Item', values='Value')
    dfapply = dfapply.fillna(0.0)

    store_time[i] = time.time()-start


print('Apply took ', np.mean(store_time),' seconds')

final_times['Apply'] = np.mean(store_time)
time_std[1] = np.std(store_time)

dfapply.head()

Apply took  0.20554128646850586  seconds


Item,"Ammonia, anhydrous",Ammonium nitrate (AN),Ammonium sulphate,Calcium ammonium nitrate (CAN) and other mixtures with calcium carbonate,Diammonium phosphate (DAP),Fertilizers n.e.c.,Monoammonium phosphate (MAP),NPK fertilizers,Other NK compounds,Other NP compounds,...,PK compounds,Phosphate rock,Potassium chloride (muriate of potash) (MOP),Potassium nitrate,Potassium sulphate (sulphate of potash) (SOP),Sodium nitrate,Superphosphates above 35%,"Superphosphates, other",Urea,Urea and ammonium nitrate solutions (UAN)
Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,8.56,3244.57,28.34,20.03,67393.32,42464.99,3444.53,105892.68,0.0,163301.41,...,0.26,14176.45,543.43,0.07,23.9,17.85,13448.58,0.0,1616890.03,5.25
Albania,1120.54,1001720.59,1953.95,16113.21,532561.01,18351.73,1245.36,130420.0,0.0,31440.98,...,4094.71,6281.35,2740.46,418.36,3681.6,6431.68,569761.33,38753.0,967101.33,104.8
Algeria,9993103.16,2117203.14,199644.67,167570.44,58572.51,92574.98,257234.52,2933324.71,0.0,146740.93,...,57253.36,26503675.84,106605.88,71239.21,692694.37,3745.73,869833.8,131825.0,11192522.82,3587527.8
Angola,493.25,133245.73,270978.84,944.01,3925.1,81258.44,1915.74,599950.71,0.0,10150.54,...,58616.51,1446.86,97073.91,1843.15,3097.75,633.98,6881.57,650.44,148596.06,2173.38
Antigua and Barbuda,44.96,369.22,25.51,0.05,82.76,1856.54,0.02,4000.68,0.0,103.18,...,329.88,5.03,0.43,1.2,25.32,0.0,0.01,1.0,42.74,0.01


In [4]:
#List comprehension - create/manipulate lists in a single line modified for loop
#Can become incomprehensible if too long

for i in range(iterations):
    dflist = pd.DataFrame(index=np.unique(df['Area']),columns=np.unique(df['Item']))

    start = time.time()


    total = dict()
    combined = [list(row) for row in zip(df['Area'],df['Item'],df['Unit'],df['Value']) if row[2]=='tonnes']

    for item in combined:
        item = '_'.join(item[0:3])
        total[item] = 0

    for item in combined:
        key = '_'.join(item[0:3])
        if isinstance(item[3], float):
            total[key] += item[3]

    for k,v in total.items():
        items = k.split('_')
        dflist.at[items[0],items[1]] = v

    dflist = dflist.fillna(0.0)

    store_time[i] = time.time()-start


print('List comprehension took ',np.mean(store_time),' seconds')

final_times['List'] = np.mean(store_time)
time_std[2] = np.std(store_time)

dflist.head()

List comprehension took  0.659500675201416  seconds


Unnamed: 0,"Ammonia, anhydrous",Ammonium nitrate (AN),Ammonium sulphate,Calcium ammonium nitrate (CAN) and other mixtures with calcium carbonate,Diammonium phosphate (DAP),Fertilizers n.e.c.,Monoammonium phosphate (MAP),NPK fertilizers,Other NK compounds,Other NP compounds,...,PK compounds,Phosphate rock,Potassium chloride (muriate of potash) (MOP),Potassium nitrate,Potassium sulphate (sulphate of potash) (SOP),Sodium nitrate,Superphosphates above 35%,"Superphosphates, other",Urea,Urea and ammonium nitrate solutions (UAN)
Afghanistan,8.56,3244.57,28.34,20.03,67393.32,42464.99,3444.53,105892.68,0.0,163301.41,...,0.26,14176.45,543.43,0.07,23.9,17.85,13448.58,0.0,1616890.03,5.25
Albania,1120.54,1001720.59,1953.95,16113.21,532561.01,18351.73,1245.36,130420.0,0.0,31440.98,...,4094.71,6281.35,2740.46,418.36,3681.6,6431.68,569761.33,38753.0,967101.33,104.8
Algeria,9993103.16,2117203.14,199644.67,167570.44,58572.51,92574.98,257234.52,2933324.71,0.0,146740.93,...,57253.36,26503675.84,106605.88,71239.21,692694.37,3745.73,869833.8,131825.0,11192522.82,3587527.8
Angola,493.25,133245.73,270978.84,944.01,3925.1,81258.44,1915.74,599950.71,0.0,10150.54,...,58616.51,1446.86,97073.91,1843.15,3097.75,633.98,6881.57,650.44,148596.06,2173.38
Antigua and Barbuda,44.96,369.22,25.51,0.05,82.76,1856.54,0.02,4000.68,0.0,103.18,...,329.88,5.03,0.43,1.2,25.32,0.0,0.01,1.0,42.74,0.01


In [5]:
#Cython - uses C framework to implement Python code
#Does not perform as well if iterating through to access the file
#Not easy to test without building pyx file each time

#Create/modify setup.py and build in terminal using "python setup.py build_ext --inplace"

import fertilizerprocessor


for i in range(iterations):
    dfcython = pd.DataFrame(index=np.unique(df['Area']),columns=np.unique(df['Item']))

    start = time.time()

    dataframedict = fertilizerprocessor.fertilizerCount(df['Area'].tolist(), df['Item'].tolist(), df['Unit'].tolist(), df['Value'].tolist())


    for k,v in dataframedict.items():
        items = k.split('_')
        dfcython.at[items[0],items[1]] = v

    dfcython = dfcython.fillna(0.0)

    store_time[i] = time.time()-start


print('Cython took: ',np.mean(store_time),' seconds')

final_times['Cython'] = np.mean(store_time)
time_std[3] = np.std(store_time)

dfcython.head()



Cython took:  0.37970093250274656  seconds


Unnamed: 0,"Ammonia, anhydrous",Ammonium nitrate (AN),Ammonium sulphate,Calcium ammonium nitrate (CAN) and other mixtures with calcium carbonate,Diammonium phosphate (DAP),Fertilizers n.e.c.,Monoammonium phosphate (MAP),NPK fertilizers,Other NK compounds,Other NP compounds,...,PK compounds,Phosphate rock,Potassium chloride (muriate of potash) (MOP),Potassium nitrate,Potassium sulphate (sulphate of potash) (SOP),Sodium nitrate,Superphosphates above 35%,"Superphosphates, other",Urea,Urea and ammonium nitrate solutions (UAN)
Afghanistan,8.56,3244.57,28.34,20.03,67393.32,42464.99,3444.53,105892.68,0.0,163301.41,...,0.26,14176.45,543.43,0.07,23.9,17.85,13448.58,0.0,1616890.03,5.25
Albania,1120.54,1001720.59,1953.95,16113.21,532561.01,18351.73,1245.36,130420.0,0.0,31440.98,...,4094.71,6281.35,2740.46,418.36,3681.6,6431.68,569761.33,38753.0,967101.33,104.8
Algeria,9993103.16,2117203.14,199644.67,167570.44,58572.51,92574.98,257234.52,2933324.71,0.0,146740.93,...,57253.36,26503675.84,106605.88,71239.21,692694.37,3745.73,869833.8,131825.0,11192522.82,3587527.8
Angola,493.25,133245.73,270978.84,944.01,3925.1,81258.44,1915.74,599950.71,0.0,10150.54,...,58616.51,1446.86,97073.91,1843.15,3097.75,633.98,6881.57,650.44,148596.06,2173.38
Antigua and Barbuda,44.96,369.22,25.51,0.05,82.76,1856.54,0.02,4000.68,0.0,103.18,...,329.88,5.03,0.43,1.2,25.32,0.0,0.01,1.0,42.74,0.01


In [6]:
#Itertuples - creates tuples of each row which can then be accessed
#Iterates through each row which is quite inefficient

for i in range(iterations):
    dftuples = pd.DataFrame(index=np.unique(df['Area']), columns=np.unique(df['Item']))

    start = time.time()

    dftuples = dftuples.fillna(0.0)

    for row in df.itertuples(index=False, name='Row'):
        if row.Unit=='tonnes':
             dftuples.at[row.Area, row.Item] += row.Value

    store_time[i] = time.time()-start

print('Itertuples took ',np.mean(store_time),' seconds')

final_times['Itertuples'] = np.mean(store_time)
time_std[4] = np.std(store_time)

dftuples.head()

Itertuples took  4.277632451057434  seconds


Unnamed: 0,"Ammonia, anhydrous",Ammonium nitrate (AN),Ammonium sulphate,Calcium ammonium nitrate (CAN) and other mixtures with calcium carbonate,Diammonium phosphate (DAP),Fertilizers n.e.c.,Monoammonium phosphate (MAP),NPK fertilizers,Other NK compounds,Other NP compounds,...,PK compounds,Phosphate rock,Potassium chloride (muriate of potash) (MOP),Potassium nitrate,Potassium sulphate (sulphate of potash) (SOP),Sodium nitrate,Superphosphates above 35%,"Superphosphates, other",Urea,Urea and ammonium nitrate solutions (UAN)
Afghanistan,8.56,3244.57,28.34,20.03,67393.32,42464.99,3444.53,105892.68,0.0,163301.41,...,0.26,14176.45,543.43,0.07,23.9,17.85,13448.58,0.0,1616890.03,5.25
Albania,1120.54,1001720.59,1953.95,16113.21,532561.01,18351.73,1245.36,130420.0,0.0,31440.98,...,4094.71,6281.35,2740.46,418.36,3681.6,6431.68,569761.33,38753.0,967101.33,104.8
Algeria,9993103.16,2117203.14,199644.67,167570.44,58572.51,92574.98,257234.52,2933324.71,0.0,146740.93,...,57253.36,26503675.84,106605.88,71239.21,692694.37,3745.73,869833.8,131825.0,11192522.82,3587527.8
Angola,493.25,133245.73,270978.84,944.01,3925.1,81258.44,1915.74,599950.71,0.0,10150.54,...,58616.51,1446.86,97073.91,1843.15,3097.75,633.98,6881.57,650.44,148596.06,2173.38
Antigua and Barbuda,44.96,369.22,25.51,0.05,82.76,1856.54,0.02,4000.68,0.0,103.18,...,329.88,5.03,0.43,1.2,25.32,0.0,0.01,1.0,42.74,0.01


In [None]:
#Iterrows - iterates through each row
#Inefficient by producing Pandas Series and index from each row

for i in range(iterations):
    dfrows = pd.DataFrame(index=np.unique(df['Area']), columns=np.unique(df['Item']))

    start = time.time()

    dfrows = dfrows.fillna(0.0)

    for index, row in df.iterrows():
        if row['Unit']=='tonnes':
             dfrows.at[row['Area'], row['Item']] += row['Value']

    store_time[i] = time.time()-start

print('Iterrows took ',np.mean(store_time),' seconds')

final_times['Iterrows'] = np.mean(store_time)
time_std[5] = np.std(store_time)

dfrows.head()

In [None]:
dffinal = dfpivot.copy(deep=False)
dffinal.head()

In [None]:
#Get most popular fertilizers/countries
dffinal = dffinal.reindex(dffinal.median().sort_values(ascending=False).index, axis=1)
dffinal = dffinal.T
dffinal = dffinal.reindex(dffinal.median().sort_values(ascending=False).index, axis=1)

dffinal = dffinal.head(10)
dffinal = dffinal.iloc[:,:10]
dffinal = dffinal.T
dffinal.head()

In [None]:
#Get proportion of each fertilizer
dffinal = dffinal.apply(lambda x: x/x.sum()*100, axis = 1)
dffinal.head()

In [None]:
fig = plt.figure(figsize=(30,30))
ax = fig.add_subplot(111)
ax.set_title('Fertilizer Import by Country and Type', fontsize=50)
dffinal.plot(ax=ax,kind='bar',stacked=True)
ax.set_xlabel('Country', fontsize=25)
ax.set_xticklabels(labels=dffinal.index.values.tolist(), fontsize=20, rotation=60)
ax.set_yticklabels(labels=np.arange(0, 101, 20), fontsize=20)
ax.set_ylabel('Percent Import of Fertilizer (%)', fontsize=25)
ax.grid(True, linestyle='dashed', linewidth=0.5)

plt.subplots_adjust(top=0.95)
plt.ylim([0, 100])
leg = plt.legend(dffinal.columns.to_list(),loc=(1,0.5), frameon=False, title='Fertilizers', title_fontsize=20, fontsize=16.5, labelspacing=2.5)
leg._legend_box.align = 'left'
flags = ['USA.png', 'Brazil.png', 'Russia.png', 'France.png', 'China.png', 'Spain.png', 'Mexico.png', 'India.png', 'Netherlands.png', 'Germany.png']

placement = 0.15
for img in flags:
    img = '../data/flags/'+img
    flag_img = mpimg.imread(img)
    imax = fig.add_axes([placement, 0.025, 0.025, 0.025])
    imax.imshow(flag_img)
    imax.axis('off')
    placement+=0.0675

fig.subplots_adjust(right=0.8)
fig.savefig('../output/fertilizerbycountrybarplot.png')

In [None]:
#Get average final time in a dataframe
timedf = pd.DataFrame(final_times.items(), columns=['Type','Time'])
timedf['Std'] = time_std

timedf.head()

In [None]:
sns.set(style='whitegrid')

fig1 = plt.figure(figsize=(30,30))
ax1 = fig1.add_subplot(111)


ax1 = sns.barplot(x='Type', y='Time', data=timedf, palette = sns.color_palette('GnBu'), ci='sd')
plt.errorbar(x=timedf['Type'], y=timedf['Time'], yerr=timedf['Std'], ls='none', elinewidth=5, ecolor='black')

ax1.set_xlabel('Iteration Type', fontsize=25)
ax1.set_ylabel('Mean Time (s)', fontsize=25)
ax1.set_title('Dataframe Iteration Time by Type', fontsize=50)

ax1.tick_params(labelsize=20)

plt.tight_layout()
fig1.savefig('../output/iterationtypetimebarplot.png')

In [None]:
#Remove rows for time

#Vectorization

df = pd.read_csv('../data/FertilizersProduct.csv')
vect_time = np.empty(shape=int(df.shape[0]/divisor), dtype='float64')

counter = 0
while df.shape[0]>divisor:

    dfpivot = df.copy(deep=False)

    start = time.time()

    dfpivot = dfpivot[dfpivot['Unit']=='tonnes'].drop(columns=['Unit'])
    dfpivot = dfpivot.groupby(['Area','Item'], as_index=False)['Value'].sum()
    dfpivot = dfpivot.pivot(index='Area', columns='Item', values='Value')

    dfpivot = dfpivot.fillna(0.0)


    vect_time[counter] = time.time()-start
    df.drop(df.tail(divisor).index, inplace=True)
    counter+=1


df.head()


In [None]:
#Apply

df = pd.read_csv('../data/FertilizersProduct.csv')
apply_time = np.empty(shape=int(df.shape[0]/divisor), dtype='float64')

counter = 0
while df.shape[0]>divisor:

    dfapply = df.copy(deep=False)

    start = time.time()


    dfapply = dfapply[['Area','Item','Unit','Value']]
    dfapply = dfapply[dfapply['Unit'].apply(lambda x: x=='tonnes')]
    dfapply.drop(columns=['Unit'], inplace=True)
    dfapply = dfapply.groupby(['Area','Item'], as_index=False)['Value'].sum()
    dfapply = dfapply.pivot(index='Area', columns='Item', values='Value')
    dfapply = dfapply.fillna(0.0)

    apply_time[counter] = time.time()-start
    df.drop(df.tail(divisor).index, inplace=True)
    counter+=1

df.head()


In [None]:
#List comprehension

df = pd.read_csv('../data/FertilizersProduct.csv')
list_time = np.empty(shape=int(df.shape[0]/divisor), dtype='float64')

counter = 0
while df.shape[0]>divisor:

    dflist = pd.DataFrame(index=np.unique(df['Area']),columns=np.unique(df['Item']))

    start = time.time()


    total = dict()
    combined = [list(row) for row in zip(df['Area'],df['Item'],df['Unit'],df['Value']) if row[2]=='tonnes']

    for item in combined:
        item = '_'.join(item[0:3])
        total[item] = 0

    for item in combined:
        key = '_'.join(item[0:3])
        if isinstance(item[3], float):
            total[key] += item[3]

    for k,v in total.items():
        items = k.split('_')
        dflist.at[items[0],items[1]] = v

    dflist = dflist.fillna(0.0)

    list_time[counter] = time.time()-start
    df.drop(df.tail(divisor).index, inplace=True)
    counter+=1

df.head()


In [None]:
#Cython

import fertilizerprocessor

df = pd.read_csv('../data/FertilizersProduct.csv')
cython_time = np.empty(shape=int(df.shape[0]/divisor), dtype='float64')

counter = 0
while df.shape[0]>divisor:
    dfcython = pd.DataFrame(index=np.unique(df['Area']),columns=np.unique(df['Item']))

    start = time.time()

    dataframedict = fertilizerprocessor.fertilizerCount(df['Area'].tolist(), df['Item'].tolist(), df['Unit'].tolist(), df['Value'].tolist())


    for k,v in dataframedict.items():
        items = k.split('_')
        dfcython.at[items[0],items[1]] = v

    dfcython = dfcython.fillna(0.0)

    cython_time[counter] = time.time()-start
    df.drop(df.tail(divisor).index, inplace=True)
    counter+=1

df.head()


In [None]:
#Itertuples

df = pd.read_csv('../data/FertilizersProduct.csv')
itertuples_time = np.empty(shape=int(df.shape[0]/divisor), dtype='float64')

counter=0
while df.shape[0]>divisor:
    dftuples = pd.DataFrame(index=np.unique(df['Area']), columns=np.unique(df['Item']))

    start = time.time()

    dftuples = dftuples.fillna(0.0)

    for row in df.itertuples(index=False, name='Row'):
        if row.Unit=='tonnes':
             dftuples.at[row.Area, row.Item] += row.Value

    itertuples_time[counter] = time.time()-start
    df.drop(df.tail(divisor).index, inplace=True)
    counter+=1

df.head()



In [None]:
#Iterrows

df = pd.read_csv('../data/FertilizersProduct.csv')
iterrows_time = np.empty(shape=int(df.shape[0]/divisor), dtype='float64')

counter=0
while df.shape[0]>divisor:
    dfrows = pd.DataFrame(index=np.unique(df['Area']), columns=np.unique(df['Item']))

    start = time.time()

    dfrows = dfrows.fillna(0.0)

    for index, row in df.iterrows():
        if row['Unit']=='tonnes':
             dfrows.at[row['Area'], row['Item']] += row['Value']

    iterrows_time[counter] = time.time()-start
    df.drop(df.tail(divisor).index, inplace=True)
    counter+=1


df.head()

In [None]:
timeeqdf = pd.DataFrame(list(zip(vect_time, apply_time, list_time, cython_time, itertuples_time, iterrows_time)), columns=['Vect','Apply','List','Cython','Itertuples','Iterrows'])
timeeqdf.head()

In [None]:
timeeqdf = pd.read_csv('../../dumpfiles/dfiteritems_eqtime.csv')

In [None]:
timeeqdf = timeeqdf.reindex(index=timeeqdf.index[::-1])
timeeqdf.head()

In [None]:
timeeqdf.reset_index(inplace=True)
timeeqdf.head()

In [None]:
timeeqdf = pd.melt(timeeqdf, ['index'])

In [None]:
#Output stats of graph
iter_types = ['Vect','Apply','List','Cython','Itertuples','Iterrows']
statsdf = pd.DataFrame(columns=['type','slope','intercept','r_value','p_value','std_err'])

for i, type in enumerate(iter_types):
    slope, intercept, r_value, p_value, std_err = stats.linregress(timeeqdf.loc[timeeqdf['variable']==type, 'index'], timeeqdf.loc[timeeqdf['variable']==type, 'value'])
    statsdf.loc[i] = [type, slope, intercept, r_value, p_value, std_err]

statsdf.to_csv('../output/iterationtypetimelinestats.csv', index=False)

In [None]:
fig2 = plt.figure(figsize=(30,30))
ax2 = fig2.add_subplot(111)


sns.lineplot(x='index', y='value', hue='variable', data=timeeqdf, ax=ax2)

ax2.set_title('Dataframe Iteration Time Complexity', fontsize=50)
ax2.set_xlabel('Number of Rows (/'+divisor+')', fontsize=25)
ax2.set_ylabel('Time (s)', fontsize=25)
ax2.tick_params(labelsize=20)
ax2.grid(True, linestyle='dashed', linewidth=0.5)

legend = ax2.legend()
legend.texts[0].set_text('Types')

plt.xlim(0, None)
plt.ylim(0, None)
plt.tight_layout()
plt.setp(ax2.get_legend().get_title(), fontsize='30')
plt.setp(ax2.get_legend().get_texts(), fontsize='25')

fig2.show()
fig2.savefig('../output/iterationtypetimelineplot.png')