## 1. Data Aggregation
- Data aggregation means selecting the data that satisfies a condition and summarising it (count, mean, min, max, ...).
- It helps in studying one or more aggregated groups together.


In [11]:
from io import BytesIO
from urllib import request

import numpy as np
import pandas as pd
# Download the Iris CSV and buffer it in memory so the HTTP connection is
# closed right away instead of staying open for the lifetime of the kernel.
myurl='http://aima.cs.berkeley.edu/data/iris.csv'
urlRequest= request.Request(myurl)
# A timeout keeps the cell from hanging indefinitely if the host does not answer.
with request.urlopen(urlRequest, timeout=30) as response:
    # BytesIO is a file-like object, so it can still be handed to pd.read_csv.
    iris_file = BytesIO(response.read())

In [12]:
# Column labels supplied by hand: four measurements followed by the class label.
iris_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_Width', 'target']

# header=None tells pandas to treat the first line as data, not as column names.
df = pd.read_csv(
    iris_file,
    sep=',',
    decimal='.',
    header=None,
    names=iris_columns,
)
print(df)

     sepal_length  sepal_width  petal_length  petal_Width     target
0             5.1          3.5           1.4          0.2     setosa
1             4.9          3.0           1.4          0.2     setosa
2             4.7          3.2           1.3          0.2     setosa
3             4.6          3.1           1.5          0.2     setosa
4             5.0          3.6           1.4          0.2     setosa
5             5.4          3.9           1.7          0.4     setosa
6             4.6          3.4           1.4          0.3     setosa
7             5.0          3.4           1.5          0.2     setosa
8             4.4          2.9           1.4          0.2     setosa
9             4.9          3.1           1.5          0.1     setosa
10            5.4          3.7           1.5          0.2     setosa
11            4.8          3.4           1.6          0.2     setosa
12            4.8          3.0           1.4          0.1     setosa
13            4.3          3.0    

In [13]:
# Show the container type and its (rows, columns) dimensions, one per line.
print(type(df), df.shape, sep='\n')

<class 'pandas.core.frame.DataFrame'>
(150, 5)


Above, we have 5 columns in total: the first 4 are **features or attributes**  
and the last one is the **target**, i.e. the class each record belongs to  

##### To see the different values in target (called **classes**), let's apply `set`

In [16]:
# Collapse the label column into a set to list the distinct class names.
set(df.loc[:, 'target'])

{'setosa', 'versicolor', 'virginica'}

we have total 3 classes for Target - 'setosa', 'versicolor', 'virginica'

##### Let's see the total number of records
- 3 ways: `df.shape`, `df.count()` and `df['target'].count()`

In [21]:
n_rows, n_cols = df.shape
print(df.shape)              # (rows, columns) tuple
print(n_rows)                # number of rows
print(n_cols)                # number of columns
print(df.count())            # non-null count for every column
print(df['target'].count())  # non-null count for a single column

(150, 5)
150
5
sepal_length    150
sepal_width     150
petal_length    150
petal_Width     150
target          150
dtype: int64
150


##### Let's get only the setosa rows - by boolean masking

In [24]:
# Element-wise comparison: True where the row's class is 'setosa'.
df['target'].eq('setosa')

0       True
1       True
2       True
3       True
4       True
5       True
6       True
7       True
8       True
9       True
10      True
11      True
12      True
13      True
14      True
15      True
16      True
17      True
18      True
19      True
20      True
21      True
22      True
23      True
24      True
25      True
26      True
27      True
28      True
29      True
       ...  
120    False
121    False
122    False
123    False
124    False
125    False
126    False
127    False
128    False
129    False
130    False
131    False
132    False
133    False
134    False
135    False
136    False
137    False
138    False
139    False
140    False
141    False
142    False
143    False
144    False
145    False
146    False
147    False
148    False
149    False
Name: target, Length: 150, dtype: bool

In [29]:
# Keep only the rows whose class is 'setosa', then count the non-null
# values in each column of that subset.
is_setosa = df['target'].eq('setosa')
df_2 = df.loc[is_setosa]
df_2.count()

sepal_length    50
sepal_width     50
petal_length    50
petal_Width     50
target          50
dtype: int64

##### Let's get the mean, min, max, std and median of the sepal length

In [34]:
# Summary statistics of sepal length for the setosa subset, printed one per line
# in the order: mean, min, max, std, median.
setosa_sepal_length = df_2['sepal_length']
for stat in ('mean', 'min', 'max', 'std', 'median'):
    print(getattr(setosa_sepal_length, stat)())

5.005999999999999
4.3
5.8
0.3524896872134512
5.0


In [33]:
# Or get all of the summary statistics for the numeric columns in a single call.
df_2.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_Width
count,50.0,50.0,50.0,50.0
mean,5.006,3.418,1.464,0.244
std,0.35249,0.381024,0.173511,0.10721
min,4.3,2.3,1.0,0.1
25%,4.8,3.125,1.4,0.2
50%,5.0,3.4,1.5,0.2
75%,5.2,3.675,1.575,0.3
max,5.8,4.4,1.9,0.6


## 2. Data Transformation

In [37]:
# Build a small synthetic weather table: 10 readings, 5 for each location code.
# A locally seeded generator makes the "random" values identical on every run,
# so the printed table and anything computed from it are reproducible.
rng = np.random.RandomState(42)
temp_values = 28 + 10 * rng.randn(10)    # normal draws centred on 28, spread 10
rain_values = 100 + 50 * rng.randn(10)   # normal draws centred on 100, spread 50

df = pd.DataFrame({
    'temp': temp_values,
    'rain': rain_values,
    'location': list('AAAAABBBBB'),
})

print(df)

  location        rain       temp
0        A   50.271035  27.735242
1        A  112.134255  19.980076
2        A   81.872100  23.903548
3        A  106.434611  29.982170
4        A  148.884689  21.126927
5        B   87.323570  48.788428
6        B  128.206311  40.236515
7        B  122.665981  43.984229
8        B   62.086830  29.511422
9        B  180.580154  18.589263


##### Let's replace the location names A, B with Hyderabad, Mumbai

In [39]:
# Map the one-letter location codes to city names.
# The nested dict restricts the replacement to the 'location' column.
replacements = {
    'location': {'A': 'Hyderabad', 'B': 'Mumbai'}
}

# Exact-value matching (the default) is used instead of regex=True: with
# regex=True the keys are treated as patterns and would also rewrite any
# 'A' or 'B' found inside a longer string.
df = df.replace(replacements)
print(df.head(2))

    location        rain       temp
0  Hyderabad   50.271035  27.735242
1  Hyderabad  112.134255  19.980076


##### It is also possible to filter rows based on whether a string column matches a specific pattern, and then modify them.

In [40]:
# Boolean mask: True for rows whose location text contains 'umb'.
has_umb = df['location'].str.contains('umb')
mumbai_data = df.loc[has_umb, :]

print(mumbai_data.head(2))

  location        rain       temp
5   Mumbai   87.323570  48.788428
6   Mumbai  128.206311  40.236515


##### Similarly, many other operations can be done on columns holding string-like data

In [42]:
# Boolean mask: True for rows whose location text ends with 'bad'.
ends_with_bad = df['location'].str.endswith('bad')
hyd_data = df.loc[ends_with_bad, :]

print(hyd_data.head(2))

    location        rain       temp
0  Hyderabad   50.271035  27.735242
1  Hyderabad  112.134255  19.980076


## Grouping
Grouping involves one or more of the following steps:
- Splitting the data into groups based on some criteria
- Applying a function to each group independently
- Combining the results into a data structure

**GroupBy in pandas works similarly to `GROUP BY` in SQL**

For the data above:

In [44]:
# Split the rows by location, then average the remaining columns within each group.
regions = df.groupby('location')
location_means = regions.mean()
print(location_means)

                 rain       temp
location                        
Hyderabad   99.919338  24.545593
Mumbai     116.172569  36.221971


In [1]:
import pandas as pd
import numpy as np

In [2]:
# NOTE(review): this is an absolute path on the author's machine — adjust it before running elsewhere.
file_name_string = 'C:/Users/Charles Kelly/Desktop/Exercise Files/02_07/Begin/EmployeesWithGrades.xlsx'

# Read the 'Sheet1' worksheet; cells containing 'NA' are loaded as missing values.
employees_df = pd.read_excel(
    file_name_string,
    'Sheet1',
    index_col=None,
    na_values=['NA'],
)

In [3]:
# Display the loaded employee table.
employees_df

Unnamed: 0,Department,Name,YearsOfService,Grade
0,Marketing,Able,4,a
1,Engineering,Baker,7,b
2,Accounting,Charlie,12,c
3,Marketing,Delta,1,d
4,Engineering,Echo,15,f
5,Accounting,Foxtrot,9,a
6,Marketing,Golf,3,b
7,Engineering,Hotel,1,c
8,Accounting,India,2,d
9,Marketing,Juliet,5,f


### group by Department
documentation: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html

calculate total years of service by employees in each department

In [4]:
# Total years of service per department.
# The column is selected explicitly so that only YearsOfService is summed;
# without the selection, pandas 2.x also "sums" the text columns (Name, Grade)
# by concatenating their strings. The double brackets keep the result a DataFrame.
employees_df.groupby('Department')[['YearsOfService']].sum()

Unnamed: 0_level_0,YearsOfService
Department,Unnamed: 1_level_1
Accounting,47
Engineering,60
Marketing,52
