# Spring Cleaning!

Harold's stock data is a mess! Help him clean up his data before the auditors arrive!

In [9]:
# Import Libraries
import pandas as pd
from pathlib import Path

### Load CSV data into Pandas using `read_csv`

In [10]:
# Location of the raw stock CSV; a relative path, so it is resolved against the
# current working directory when the cell runs.
path = Path("../../Resources/stock_data.csv")
# Read the whole file into a DataFrame with pandas' default parsing options.
data = pd.read_csv(path)


### Identify the number of rows and columns (shape) in the DataFrame.

In [11]:
# Unpack the frame's dimensions and display them as a (rows, columns) tuple.
n_rows, n_cols = data.shape
n_rows, n_cols

(504, 14)

### Generate a sample of the data to visually ensure data has been loaded in correctly.

In [12]:
# Show the first five rows as a quick visual check of the load.
data.head(n=5)

Unnamed: 0,symbol,name,sector,price,price_per_earnings,dividend_yield,earnings_per_share,52_week_low,52_week_high,market_cap,ebitda,price_per_sales,price_per_book,sec_filings
0,MMM,3M Company,Industrials,$222.89,24.31,2.332862,$7.92,259.77,175.49,138721100000.0,9048000000.0,4.390271,11.34,http://www.sec.gov/cgi-bin/browse-edgar?action...
1,AOS,A.O. Smith Corp,Industrials,,,,,,,,,,,
2,ABT,Abbott Laboratories,Health Care,56.27,22.51,1.908982,0.26,64.6,42.28,102121000000.0,5744000000.0,3.74048,3.19,http://www.sec.gov/cgi-bin/browse-edgar?action...
3,ABBV,AbbVie Inc.,Health Care,108.48,19.41,2.49956,3.29,125.86,60.05,181386300000.0,10310000000.0,6.291571,26.14,http://www.sec.gov/cgi-bin/browse-edgar?action...
4,ATVI,Activision Blizzard,Information Technology,65.83,,0.431903,1.28,74.945,38.93,52518670000.0,2704000000.0,10.59512,5.16,http://www.sec.gov/cgi-bin/browse-edgar?action...


### Identify the number of records in the DataFrame, and compare it with the number of rows in the original file.

In [13]:
# Per-column tally of non-null entries; any column below the row count has gaps.
data.notna().sum()

symbol                504
name                  502
sector                501
price                 500
price_per_earnings    497
dividend_yield        499
earnings_per_share    498
52_week_low           500
52_week_high          500
market_cap            500
ebitda                492
price_per_sales       500
price_per_book        492
sec_filings           500
dtype: int64

### Identify null records

In [31]:
# Boolean mask with the same shape as `data`: True wherever a value is missing.
data.isna()

Unnamed: 0,symbol,name,sector,price,price_per_earnings,dividend_yield,earnings_per_share,52_week_low,52_week_high,market_cap,ebitda,price_per_sales,price_per_book,sec_filings
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,False,False,False,False,False,False,False,False,False,False,False,False,False,False
500,False,False,False,False,False,False,False,False,False,False,False,False,False,False
501,False,False,False,False,False,False,False,False,False,False,False,False,False,False
502,False,False,False,False,False,False,False,False,False,False,False,False,False,False


### Drop Null Records

In [20]:
# Keep only the rows in which every column has a value (defaults spelled out).
data = data.dropna(axis=0, how="any")

### Validate nulls have been dropped

In [26]:
# Fraction of missing values per column; 0.0 everywhere means no nulls remain.
data.isna().mean()


symbol                0.0
name                  0.0
sector                0.0
price                 0.0
price_per_earnings    0.0
dividend_yield        0.0
earnings_per_share    0.0
52_week_low           0.0
52_week_high          0.0
market_cap            0.0
ebitda                0.0
price_per_sales       0.0
price_per_book        0.0
sec_filings           0.0
dtype: float64

### Default null `ebitda` values to 0. Then, validate no records are null for ebitda.

In [27]:
# Substitute 0 for any missing `ebitda` entry, then count the nulls that remain.
# NOTE(review): the earlier `data.dropna()` cell already discards every row that
# contains a null, so once that cell has run this fill has nothing left to replace.
data["ebitda"] = data["ebitda"].fillna(value=0)
data["ebitda"].isna().sum()

0

### Drop Duplicates

In [32]:
# Keep the first occurrence of each fully identical row and discard the repeats.
data = data.drop_duplicates(subset=None, keep="first")

### Sample `price` field

In [33]:
# Peek at the first five `price` values to see how they are formatted.
data["price"].head(n=5)

0    $222.89
2      56.27
3     108.48
5     108.48
6     185.16
Name: price, dtype: object

### Clean `price` Series by replacing `$`

In [None]:
# Strip the literal dollar sign from every price string.
# `regex=False` makes "$" a plain character: read as a regular expression it is
# the end-of-string anchor, and the default for `regex` differs between pandas
# versions, so the intent is stated explicitly here.
data["price"] = data["price"].str.replace("$", "", regex=False)
# Preview the cleaned column (the values are still strings at this point).
data["price"].head()



### Confirm data type of `price`

In [None]:
# Look up the dtype currently recorded for the `price` column.
data.dtypes["price"]

### Cast `price` Series as float and then validate using `dtype`

In [None]:
# Convert the `price` column to floating point, then display the resulting dtype
# to confirm the cast took effect.
data["price"] = data["price"].astype(float)
data.dtypes["price"]