# Spring Cleaning!

Harold's stock data is a mess! Help him clean up his data before the auditors arrive!

In [1]:
# Import Libraries
import pandas as pd
from pathlib import Path

### Load CSV data into Pandas using `read_csv`

In [14]:
# Point at the raw CSV (the path is relative to this notebook) and read it into a DataFrame.
csv_path = Path("../../Resources/stock_data.csv")
df = pd.read_csv(filepath_or_buffer=csv_path)

# Peek at five randomly drawn rows as a quick sanity check of the load.
df.sample(n=5)

Unnamed: 0,symbol,name,sector,price,price_per_earnings,dividend_yield,earnings_per_share,52_week_low,52_week_high,market_cap,ebitda,price_per_sales,price_per_book,sec_filings
184,XOM,Exxon Mobil Corp.,Energy,76.07,21.37,4.00312,1.88,89.3,76.05,326148700000.0,39052000000.0,1.770194,1.85,http://www.sec.gov/cgi-bin/browse-edgar?action...
98,CTL,CenturyLink Inc,Telecommunication Services,16.2,8.35,12.661196,1.16,27.61,13.161,18237200000.0,,1.479598,1.39,http://www.sec.gov/cgi-bin/browse-edgar?action...
308,MCHP,Microchip Technology,Information Technology,79.9,21.77,1.751297,0.66,99.17,69.76,19393100000.0,997492000.0,7.447518,5.9,http://www.sec.gov/cgi-bin/browse-edgar?action...
26,GOOGL,Alphabet Inc Class A,Information Technology,1007.71,31.48,0.0,22.27,1198.0,824.3,733824000000.0,34217000000.0,6.801692,4.7,http://www.sec.gov/cgi-bin/browse-edgar?action...
414,SLG,SL Green Realty,Real Estate,90.61,14.07,3.499892,0.88,115.34,91.2,8617714000.0,795889000.0,6.582008,1.32,http://www.sec.gov/cgi-bin/browse-edgar?action...


### Identify the number of rows and columns (shape) in the DataFrame.

In [4]:
# Dimensions of the loaded frame as a (row_count, column_count) tuple.
(len(df.index), len(df.columns))

(504, 14)

### Generate a sample of the data to visually ensure data has been loaded in correctly.

In [6]:
# Draw ten random rows to eyeball that columns and values were parsed as expected.
df.sample(n=10)

Unnamed: 0,symbol,name,sector,price,price_per_earnings,dividend_yield,earnings_per_share,52_week_low,52_week_high,market_cap,ebitda,price_per_sales,price_per_book,sec_filings
304,MET,MetLife Inc.,Financials,44.28,8.52,3.458712,0.63,56.58,44.58,48679360000.0,0.0,1.049686,0.85,http://www.sec.gov/cgi-bin/browse-edgar?action...
131,CCI,Crown Castle International Corp.,Real Estate,103.81,21.45,3.862069,1.02,114.97,86.93,44183020000.0,2292600000.0,10.125291,3.54,http://www.sec.gov/cgi-bin/browse-edgar?action...
134,CMI,Cummins Inc.,Industrials,165.73,16.83,2.500868,8.23,194.18,143.8301,28669230000.0,2924000000.0,1.940617,3.89,http://www.sec.gov/cgi-bin/browse-edgar?action...
494,WYNN,Wynn Resorts Ltd,Consumer Discretionary,169.28,31.7,1.127904,7.27,203.63,92.67,18225400000.0,1501301000.0,2.478658,51.69,http://www.sec.gov/cgi-bin/browse-edgar?action...
290,M,Macy's Inc.,Consumer Discretionary,24.0,7.67,6.098546,1.98,33.73,17.405,7541063000.0,2446000000.0,0.455032,1.69,http://www.sec.gov/cgi-bin/browse-edgar?action...
103,CHK,Chesapeake Energy,Energy,2.82,4.7,0.0,-6.44,6.59,2.8,2626102000.0,1470000000.0,0.407652,1.84,http://www.sec.gov/cgi-bin/browse-edgar?action...
52,APTV,Aptiv Plc,Consumer Discretionary,89.27,69.74,0.939268,5.05,96.91,82.97,24906530000.0,2370000000.0,1.50258,7.56,http://www.sec.gov/cgi-bin/browse-edgar?action...
400,RCL,Royal Caribbean Cruises Ltd,Consumer Discretionary,122.45,16.26,1.867414,7.53,135.65,93.4,27418150000.0,2876309000.0,3.102661,2.56,http://www.sec.gov/cgi-bin/browse-edgar?action...
388,O,Realty Income Corporation,Real Estate,47.56,15.54,5.372036,1.12,63.6,48.89,13784940000.0,1075568000.0,15.588069,1.92,http://www.sec.gov/cgi-bin/browse-edgar?action...
234,HRL,Hormel Foods Corp.,Consumer Staples,32.21,20.39,2.290776,1.57,38.0,29.75,17338610000.0,1422305000.0,2.517479,3.49,http://www.sec.gov/cgi-bin/browse-edgar?action...


### Identify the number of records in the DataFrame, and compare it with the number of rows in the original file.

In [7]:
# Number of non-null entries in each column; columns that fall short of the
# row count contain missing values.
df.notna().sum()

symbol                504
name                  502
sector                501
price                 500
price_per_earnings    497
dividend_yield        499
earnings_per_share    498
52_week_low           500
52_week_high          500
market_cap            500
ebitda                492
price_per_sales       500
price_per_book        492
sec_filings           500
dtype: int64

### Identify null records

In [9]:
# Tally the missing values in each column.
df.isna().sum()

symbol                 0
name                   2
sector                 3
price                  4
price_per_earnings     7
dividend_yield         5
earnings_per_share     6
52_week_low            4
52_week_high           4
market_cap             4
ebitda                12
price_per_sales        4
price_per_book        12
sec_filings            4
dtype: int64

### Drop Null Records

In [10]:
# Discard every row that has at least one missing value; .copy() gives an
# independent frame so later column assignments do not touch the original data.
df = df.dropna(axis=0, how="any").copy()

### Validate nulls have been dropped

In [12]:
# Validation only: the frame was already cleaned by the preceding dropna() step,
# so dropping again here would prove nothing and display nothing.
# Show the per-column null counts instead -- every entry should be 0.
df.isnull().sum()

### Default null `ebitda` values to 0. Then, validate that no records have a null `ebitda`.

In [17]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# `df['ebitda']` selection: the in-place form acts on an intermediate object,
# triggers a chained-assignment warning in recent pandas, and leaves `df`
# unchanged once Copy-on-Write is enabled.
df['ebitda'] = df['ebitda'].fillna(0)

# Validate: number of `ebitda` values that are still null (expected 0).
df['ebitda'].isnull().sum()

### Drop Duplicates

In [18]:
# Keep only the first occurrence of each fully identical row; .copy() makes
# new_df an independent frame.
new_df = df.loc[~df.duplicated(keep="first")].copy()

In [22]:
# Show the first five rows of the de-duplicated frame.
new_df.head(n=5)

Unnamed: 0,symbol,name,sector,price,price_per_earnings,dividend_yield,earnings_per_share,52_week_low,52_week_high,market_cap,ebitda,price_per_sales,price_per_book,sec_filings
0,MMM,3M Company,Industrials,$222.89,24.31,2.332862,$7.92,259.77,175.49,138721100000.0,9048000000.0,4.390271,11.34,http://www.sec.gov/cgi-bin/browse-edgar?action...
1,AOS,A.O. Smith Corp,Industrials,,,,,,,,0.0,,,
2,ABT,Abbott Laboratories,Health Care,56.27,22.51,1.908982,0.26,64.6,42.28,102121000000.0,5744000000.0,3.74048,3.19,http://www.sec.gov/cgi-bin/browse-edgar?action...
3,ABBV,AbbVie Inc.,Health Care,108.48,19.41,2.49956,3.29,125.86,60.05,181386300000.0,10310000000.0,6.291571,26.14,http://www.sec.gov/cgi-bin/browse-edgar?action...
4,ATVI,Activision Blizzard,Information Technology,65.83,,0.431903,1.28,74.945,38.93,52518670000.0,2704000000.0,10.59512,5.16,http://www.sec.gov/cgi-bin/browse-edgar?action...
