In [1]:
import numpy as np
import pandas as pd

### Importação do Conjunto de Dados

In [2]:
df = pd.read_csv('agricultural_raw_material.csv')

### Explorando o Conjunto de Dados

In [3]:
df.info
#Verificando valores nulos em cada coluna
df.isnull().sum()

Month                            0
Coarse wool Price               34
Coarse wool price % Change      34
Copra Price                     22
Copra price % Change            22
Cotton Price                     0
Cotton price % Change            0
Fine wool Price                 34
Fine wool price % Change        34
Hard log Price                   0
Hard log price % Change          0
Hard sawnwood Price             34
Hard sawnwood price % Change    34
Hide Price                      34
Hide price % change             34
Plywood Price                    0
Plywood price % Change           0
Rubber Price                     0
Rubber price % Change            0
Softlog Price                   34
Softlog price % Change          34
Soft sawnwood Price             34
Soft sawnwood price % Change    34
Wood pulp Price                  1
Wood pulp price % Change         1
dtype: int64

### Lidar com dados ausentes, incorretos e inválidos

In [4]:
# Substituindo %, "," e "-"
df = df.replace('%', '', regex=True)
df = df.replace(',', '', regex=True)
df = df.replace('-', '', regex=True)
df = df.replace('', np.nan)
df = df.replace('MAY90', np.nan)

In [5]:
# Eliminando linhas com valores NaN
df = df.dropna()

In [6]:
# Verifica se todos os valores NaN foram resolvidos
df.isnull().sum()

Month                           0
Coarse wool Price               0
Coarse wool price % Change      0
Copra Price                     0
Copra price % Change            0
Cotton Price                    0
Cotton price % Change           0
Fine wool Price                 0
Fine wool price % Change        0
Hard log Price                  0
Hard log price % Change         0
Hard sawnwood Price             0
Hard sawnwood price % Change    0
Hide Price                      0
Hide price % change             0
Plywood Price                   0
Plywood price % Change          0
Rubber Price                    0
Rubber price % Change           0
Softlog Price                   0
Softlog price % Change          0
Soft sawnwood Price             0
Soft sawnwood price % Change    0
Wood pulp Price                 0
Wood pulp price % Change        0
dtype: int64

In [7]:
# Convertendo tipo de dados para float
lst = ["Coarse wool Price", "Coarse wool price % Change", "Copra Price", "Copra price % Change", "Cotton price % Change","Fine wool Price", "Fine wool price % Change", "Hard log price % Change", "Hard sawnwood price % Change", "Hide price % change", "Plywood price % Change", "Rubber price % Change", "Softlog price % Change", "Soft sawnwood price % Change", "Wood pulp price % Change"]
df[lst] = df[lst].astype("float")
df.dtypes

Month                            object
Coarse wool Price               float64
Coarse wool price % Change      float64
Copra Price                     float64
Copra price % Change            float64
Cotton Price                    float64
Cotton price % Change           float64
Fine wool Price                 float64
Fine wool price % Change        float64
Hard log Price                  float64
Hard log price % Change         float64
Hard sawnwood Price             float64
Hard sawnwood price % Change    float64
Hide Price                      float64
Hide price % change             float64
Plywood Price                   float64
Plywood price % Change          float64
Rubber Price                    float64
Rubber price % Change           float64
Softlog Price                   float64
Softlog price % Change          float64
Soft sawnwood Price             float64
Soft sawnwood price % Change    float64
Wood pulp Price                 float64
Wood pulp price % Change        float64


In [8]:
df.head()

Unnamed: 0,Month,Coarse wool Price,Coarse wool price % Change,Copra Price,Copra price % Change,Cotton Price,Cotton price % Change,Fine wool Price,Fine wool price % Change,Hard log Price,...,Plywood Price,Plywood price % Change,Rubber Price,Rubber price % Change,Softlog Price,Softlog price % Change,Soft sawnwood Price,Soft sawnwood price % Change,Wood pulp Price,Wood pulp price % Change
1,May90,447.26,7.27,234.0,0.85,1.89,3.28,1057.18,1.35,172.86,...,350.12,12.09,0.85,1.19,124.28,3.0,213.0,2.63,842.51,1.59
2,Jun90,440.99,1.4,216.0,7.69,1.99,5.29,898.24,15.03,181.67,...,373.94,6.8,0.85,0.0,129.45,4.16,200.0,6.1,831.35,1.32
3,Jul90,418.44,5.11,205.0,5.09,2.01,1.01,895.83,0.27,187.96,...,378.48,1.21,0.86,1.18,124.23,4.03,210.05,5.03,798.83,3.91
4,Aug90,418.44,0.0,198.0,3.41,1.79,10.95,951.22,6.18,186.13,...,364.6,3.67,0.88,2.33,129.7,4.4,208.3,0.83,818.74,2.49
5,Sep90,412.18,1.5,196.0,1.01,1.79,0.0,936.77,1.52,185.33,...,384.92,5.57,0.9,2.27,129.78,0.06,199.59,4.18,811.62,0.87


### Colunas de Data e Horacolunas 
##### Formatando a coluna Datetime e definindo-a como índice para conjunto de dados

In [9]:
df.Month = pd.to_datetime(df.Month.str.upper(), format= '%b%y', yearfirst=False)
# Indexing Month
df = df.set_index('Month')
df.head()

Unnamed: 0_level_0,Coarse wool Price,Coarse wool price % Change,Copra Price,Copra price % Change,Cotton Price,Cotton price % Change,Fine wool Price,Fine wool price % Change,Hard log Price,Hard log price % Change,...,Plywood Price,Plywood price % Change,Rubber Price,Rubber price % Change,Softlog Price,Softlog price % Change,Soft sawnwood Price,Soft sawnwood price % Change,Wood pulp Price,Wood pulp price % Change
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-05-01,447.26,7.27,234.0,0.85,1.89,3.28,1057.18,1.35,172.86,7.23,...,350.12,12.09,0.85,1.19,124.28,3.0,213.0,2.63,842.51,1.59
1990-06-01,440.99,1.4,216.0,7.69,1.99,5.29,898.24,15.03,181.67,5.1,...,373.94,6.8,0.85,0.0,129.45,4.16,200.0,6.1,831.35,1.32
1990-07-01,418.44,5.11,205.0,5.09,2.01,1.01,895.83,0.27,187.96,3.46,...,378.48,1.21,0.86,1.18,124.23,4.03,210.05,5.03,798.83,3.91
1990-08-01,418.44,0.0,198.0,3.41,1.79,10.95,951.22,6.18,186.13,0.97,...,364.6,3.67,0.88,2.33,129.7,4.4,208.3,0.83,818.74,2.49
1990-09-01,412.18,1.5,196.0,1.01,1.79,0.0,936.77,1.52,185.33,0.43,...,384.92,5.57,0.9,2.27,129.78,0.06,199.59,4.18,811.62,0.87


#### Análise Exploratória e Visualização

##### Vamos começar impportando matplotlib.pyplot e seaborn

In [10]:
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt 
%matplotlib inline 
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (9, 5)
matplotlib.rcParams['figure.facecolor'] = '#00000000'