<h1>Exploratory Data Analysis with Data Frames</h1>

In [19]:
import numpy as np
import pandas as pd

## Task 1: Read in the first file

In [3]:
# Load the AQI records for California, Texas and Pennsylvania.
# The path is relative to the notebook's working directory (the CSV is
# expected to sit next to the notebook) so the notebook runs on any machine.
df = pd.read_csv("epa_ca_tx_pa.csv")

# Preview the first five rows to confirm the file parsed as expected.
df.head()

Unnamed: 0,state_code,state_name,county_code,county_name,aqi
0,6,California,1,Alameda,11.0
1,6,California,7,Butte,6.0
2,6,California,19,Fresno,11.0
3,6,California,29,Kern,7.0
4,6,California,29,Kern,3.0


## Task 2: Summary information

In [4]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 546 entries, 0 to 545
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   state_code   546 non-null    int64  
 1   state_name   546 non-null    object 
 2   county_code  546 non-null    int64  
 3   county_name  546 non-null    object 
 4   aqi          546 non-null    float64
dtypes: float64(1), int64(2), object(2)
memory usage: 21.5+ KB


In [5]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,state_code,county_code,aqi
count,546.0,546.0,546.0
mean,20.593407,83.179487,8.906593
std,19.001484,92.240873,9.078479
min,6.0,1.0,0.0
25%,6.0,29.0,3.0
50%,6.0,66.0,6.0
75%,42.0,98.5,11.0
max,48.0,479.0,93.0


## Task 3: Explore your data

In [7]:
# Count how many rows each state contributes, most frequent first.
df['state_name'].value_counts()

state_name
California      342
Texas           104
Pennsylvania    100
Name: count, dtype: int64

In [8]:
# Rank every record from highest to lowest AQI; the original row labels are kept.
df_sorted = df.sort_values('aqi', ascending=False)

# Show the ten records with the highest AQI.
df_sorted.head(n=10)

Unnamed: 0,state_code,state_name,county_code,county_name,aqi
76,6,California,37,Los Angeles,93.0
146,6,California,37,Los Angeles,59.0
41,6,California,83,Santa Barbara,47.0
122,6,California,59,Orange,47.0
184,6,California,59,Orange,47.0
51,48,Texas,141,El Paso,47.0
80,6,California,65,Riverside,43.0
136,48,Texas,141,El Paso,40.0
58,6,California,65,Riverside,40.0
91,48,Texas,141,El Paso,40.0


In [9]:
# Positional slice: the 11th and 12th rows of the AQI-sorted frame
# (iloc uses row position, not the index label; the end of the slice is exclusive).
df_sorted.iloc[10:12]

Unnamed: 0,state_code,state_name,county_code,county_name,aqi
186,6,California,73,San Diego,39.0
74,6,California,37,Los Angeles,38.0


## Task 4: Examine California data

### 4a: Basic Boolean masking

In [10]:
# Boolean selector flagging the rows that belong to California.
mask = df_sorted['state_name'].eq('California')

# Keep only the flagged rows; the descending-AQI order of df_sorted carries over.
ca_df = df_sorted.loc[mask]

# Preview the first five California rows.
ca_df.head()

Unnamed: 0,state_code,state_name,county_code,county_name,aqi
76,6,California,37,Los Angeles,93.0
146,6,California,37,Los Angeles,59.0
41,6,California,83,Santa Barbara,47.0
122,6,California,59,Orange,47.0
184,6,California,59,Orange,47.0


### 4b: Validate CA data

In [11]:
# (rows, columns) of the California subset; the row count should match the
# California total reported by the state_name value counts.
ca_df.shape

(342, 5)

### 4c: Rows per CA county

In [12]:
# Number of rows recorded for each California county, most frequent first.
ca_df['county_name'].value_counts()

county_name
Los Angeles        55
Santa Barbara      26
San Bernardino     21
Orange             19
San Diego          19
Sacramento         17
Alameda            17
Fresno             16
Riverside          14
Contra Costa       13
Imperial           13
San Francisco       8
Monterey            8
Humboldt            8
Santa Clara         7
El Dorado           7
Placer              6
Butte               6
Kern                6
Mendocino           6
Solano              5
San Joaquin         5
Tulare              5
Ventura             5
Sutter              4
San Mateo           4
Marin               3
Sonoma              3
Stanislaus          3
San Luis Obispo     2
Napa                2
Santa Cruz          2
Calaveras           2
Shasta              1
Tuolumne            1
Inyo                1
Yolo                1
Mono                1
Name: count, dtype: int64

### 4d: Calculate mean AQI for Los Angeles county

In [21]:
# Flag the rows of the California subset that belong to Los Angeles county.
mask = ca_df['county_name'] == 'Los Angeles'

# Select the flagged rows and the aqi column in a single .loc step
# (rather than chained indexing), then average the AQI values.
ca_df.loc[mask, 'aqi'].mean()

13.4

## Task 5: Add more data

In [26]:
# Load the AQI records for the remaining states.
# The path is relative to the notebook's working directory (the CSV is
# expected to sit next to the notebook) so the notebook runs on any machine.
other_states = pd.read_csv("epa_others.csv")

# Preview the first five rows to confirm the file parsed as expected.
other_states.head()

Unnamed: 0,state_code,state_name,county_code,county_name,aqi
0,4,Arizona,13,Maricopa,18.0
1,4,Arizona,13,Maricopa,9.0
2,4,Arizona,19,Pima,20.0
3,8,Colorado,41,El Paso,9.0
4,12,Florida,31,Duval,15.0


## Task 6: Concatenate the data

In [27]:
# Stack the two frames row-wise (pd.concat's default axis).
# NOTE: each frame keeps its own row labels, so index values repeat in the result.
combined_df = pd.concat([df, other_states])

# Sanity check: no rows were lost or duplicated by the concatenation.
combined_df.shape[0] == df.shape[0] + other_states.shape[0]

True

## Task 7: Complex Boolean masking

In [28]:
# A row qualifies only if BOTH hold: the state is Washington and the AQI is at least 51.
mask = combined_df['state_name'].eq('Washington') & combined_df['aqi'].ge(51)

# Display the qualifying rows.
combined_df.loc[mask]

Unnamed: 0,state_code,state_name,county_code,county_name,aqi
40,53,Washington,33,King,55.0
82,53,Washington,61,Snohomish,76.0
121,53,Washington,77,Yakima,58.0
122,53,Washington,77,Yakima,57.0


## Conclusion

* pandas comes with many built-in functions and tools specifically designed for use with tabular data to simplify common tasks such as:
    * Reading and writing data to/from files
    * Quickly computing summary statistics about your data
    * Manipulating, selecting, and filtering data
    * Grouping and aggregating data
    * Adding new data to existing data

* It's powered by NumPy, which uses the power of array operations to enhance performance.