In [3]:
import pandas as pd
import numpy as np

In [45]:
# Creating an n dimensional array using a
arr1 = np.array([1,2,3,4,5,6,7,8,9])
arr1

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [46]:
arr1.tolist()

[1, 2, 3, 4, 5, 6, 7, 8, 9]

## `pd.Series`: Getting the index series  using Pandas

In [24]:
series = pd.Series(arr1)
series

0    1
1    2
2    3
3    4
4    5
dtype: int32

In [20]:
type(series)

pandas.core.series.Series

In [30]:
# Manipuklating the indices of an array by passing a n index list
series = pd.Series(arr1, index = ['A','B','C','D','E'])
series

A    1
B    2
C    3
D    4
E    5
dtype: int32

In [29]:
series['A']

1

In [33]:
# Another way to do this
series_ = pd.Series({0.1:1, 0.2:2, 0.3:3, 0.4:4, 0.5:5})
series_[0.2]

2

In [None]:
series.argmax()
# WARNING:

"""

C:\Users\Shaikh\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: 'argmax' is deprecated, use 'idxmax' instead. The behavior of 'argmax'
will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
Entry point for launching an IPython kernel.

"""
 

In [48]:
pd.DataFrame(arr1.reshape(3,3))

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [52]:
# We can also manipulate the index just as we did in Series
df = pd.DataFrame(arr1.reshape(3,3),index= ['First','Second','Third'])
df

Unnamed: 0,0,1,2
First,1,2,3
Second,4,5,6
Third,7,8,9


In [57]:
df.shape
df.index

Index(['First', 'Second', 'Third'], dtype='object')

In [56]:
# We can also change the column label
df = pd.DataFrame(arr1.reshape(3,3),columns = ['arr1','arr2','arr2'],index= ['First','Second','Third'])
df

Unnamed: 0,arr1,arr2,arr2.1
First,1,2,3
Second,4,5,6
Third,7,8,9


# `pd.Series` can also be a `pd.Dataframe`

In [58]:
series_

0.1    1
0.2    2
0.3    3
0.4    4
0.5    5
dtype: int64

In [89]:
import pprint as pp
pp.pprint(df)

        arr1  arr2  arr2
First      1     2     3
Second     4     5     6
Third      7     8     9


In [67]:
pd.DataFrame(series_)

Unnamed: 0,0
0.1,1
0.2,2
0.3,3
0.4,4
0.5,5


In [99]:
# Using Dictionaries
pd.DataFrame({'Name':['Ashhar'],'Surname':['Shaikh'],'Age':['19'],'Gender':['Male']}, index = [' '])

Unnamed: 0,Name,Surname,Age,Gender
,Ashhar,Shaikh,19,Male


In [100]:
pd.DataFrame({'Name':['Ashhar','Faraz'],
              'Surname':['Shaikh','Shaikh'],
              'Age':['19','9'],
              'Gender':['Male','Female']})

Unnamed: 0,Name,Surname,Age,Gender
0,Ashhar,Shaikh,19,Male
1,Faraz,Shaikh,9,Female


In [104]:
#Using Lists
pd.DataFrame(['Ashhar',19,'Shaikh'],
             ['Faraz',9,'Shaikh']
             )

Unnamed: 0,0
Faraz,Ashhar
9,19
Shaikh,Shaikh


In [105]:
pd.DataFrame([{'a': 1, 'b': 2}])

Unnamed: 0,a,b
0,1,2


In [106]:
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [109]:
pd.DataFrame({'a': [1, np.nan], 
              'b': [2,3],
              'c':[np.nan, 4]})

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


## `.info()`
***
Provides a summary of a DataFrame: rows, columns, data types of columns (if automatically detected) and the memory usage.

For detailed summaries of the the DataFrame, you can pass optional arguments verbose=True and null_counts=True to the .info() method to output information for all of the columns

In [110]:
pd.DataFrame({'a': [1, np.nan], 
              'b': [2,3],
              'c':[np.nan, 4]}).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 3 columns):
a    1 non-null float64
b    2 non-null int64
c    1 non-null float64
dtypes: float64(2), int64(1)
memory usage: 128.0 bytes


***
***
**Creating DataFrames manually: Hierarchical Index**

Hierarchical indexing is a feature of pandas that allows the combined use of two or more indexes per row. Each of the indexes in a hierarchical index is referred to as a level. 

The specification of multiple levels in an index allows for efficient selection of different subsets of data using different combinations of the values at each level. Technically, a pandas index that has multiple levels of hierarchy is referred to as a MultiIndex.

In [115]:
outside = ['G1']*3 + ['G2']*3  # G1 repeated thrice, then G2 repeated thrice
inside = [1, 2, 3, 1, 2, 3]  # 1,2,3 repeated twice

hier_index = list(zip(outside,inside))
print(hier_index)

hier_index = pd.MultiIndex.from_tuples(hier_index)
print(hier_index)

hier_df = pd.DataFrame(np.random.randn(6,2), index=hier_index, columns=['A','B'])

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]
MultiIndex(levels=[['G1', 'G2'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])


In [122]:
hier_df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.546006,2.671349
G1,2,1.036032,2.012382
G1,3,-0.270572,-0.13549
G2,1,2.436307,-0.419614
G2,2,-0.798683,-0.268983
G2,3,1.43076,-1.086359


# Importing Files

In [123]:
olympics = pd.read_csv('olympics.csv')

In [126]:
olympics.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,,# Summer,01 !,02 !,03 !,Total,# Winter,01 !,02 !,03 !,Total,# Games,01 !,02 !,03 !,Combined total
1,Afghanistan (AFG),13,0,0,2,2,0,0,0,0,0,13,0,0,2,2
2,Algeria (ALG),12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
3,Argentina (ARG),23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
4,Armenia (ARM),5,1,2,9,12,6,0,0,0,0,11,1,2,9,12
5,Australasia (ANZ) [ANZ],2,3,4,5,12,0,0,0,0,0,2,3,4,5,12
6,Australia (AUS) [AUS] [Z],25,139,152,177,468,18,5,3,4,12,43,144,155,181,480
7,Austria (AUT),26,18,33,35,86,22,59,78,81,218,48,77,111,116,304
8,Azerbaijan (AZE),5,6,5,15,26,5,0,0,0,0,10,6,5,15,26
9,Bahamas (BAH),15,5,2,5,12,0,0,0,0,0,15,5,2,5,12


In [None]:
# Locating the first row

olympics.columns = olympics.loc[0]

olympics.drop([0], inplace = True)


In [141]:
olympics.head()

Unnamed: 0,nan,# Summer,01 !,02 !,03 !,Total,# Winter,01 !.1,02 !.1,03 !.1,Total.1,# Games,01 !.2,02 !.2,03 !.2,Combined total
1,Afghanistan (AFG),13,0,0,2,2,0,0,0,0,0,13,0,0,2,2
2,Algeria (ALG),12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
3,Argentina (ARG),23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
4,Armenia (ARM),5,1,2,9,12,6,0,0,0,0,11,1,2,9,12
5,Australasia (ANZ) [ANZ],2,3,4,5,12,0,0,0,0,0,2,3,4,5,12


In [133]:
# `describe() gives all the attributes of the data
olympics.describe()

Unnamed: 0,nan,# Summer,01 !,02 !,03 !,Total,# Winter,01 !.1,02 !.1,03 !.1,Total.1,# Games,01 !.2,02 !.2,03 !.2,Combined total
count,147,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148
unique,147,28,55,55,57,76,22,26,25,28,35,45,56,55,64,75
top,Great Britain (GBR) [GBR] [Z],5,0,1,0,1,0,0,0,0,0,11,0,1,0,1
freq,1,17,47,27,23,26,45,109,104,105,101,17,46,26,22,26


In [134]:
olympics.shape

(148, 16)

In [2]:
import pandas as pd 
import numpy as np

weather_df = pd.read_csv("weather_2012.csv")

weather_df.describe()

weather_df.head(10)


Unnamed: 0,Date/Time,Temp (C),Dew Point Temp (C),Rel Hum (%),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Weather
0,2012-01-01 00:00:00,-1.8,-3.9,86,4,8.0,101.24,Fog
1,2012-01-01 01:00:00,-1.8,-3.7,87,4,8.0,101.24,Fog
2,2012-01-01 02:00:00,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog"
3,2012-01-01 03:00:00,-1.5,-3.2,88,6,4.0,101.27,"Freezing Drizzle,Fog"
4,2012-01-01 04:00:00,-1.5,-3.3,88,7,4.8,101.23,Fog
5,2012-01-01 05:00:00,-1.4,-3.3,87,9,6.4,101.27,Fog
6,2012-01-01 06:00:00,-1.5,-3.1,89,7,6.4,101.29,Fog
7,2012-01-01 07:00:00,-1.4,-3.6,85,7,8.0,101.26,Fog
8,2012-01-01 08:00:00,-1.4,-3.6,85,9,8.0,101.23,Fog
9,2012-01-01 09:00:00,-1.3,-3.1,88,15,4.0,101.2,Fog



## `.unique()`
***
This method, which belongs to the `Series` object, can be useful when trying to identify unique values in a column.
- Uniques are returned in order of appearance. 
- It is significantly faster than numpy.unique and includes N/A values

In [6]:
weather_df['Temp (C)'].unique()

# All the unique values in 
weather_df['Weather'].unique()

### The number of unique values is given by `nunique()`

weather_df['Weather'].nunique()

50

In [4]:
weather_df.columns.tolist()

['Date/Time',
 'Temp (C)',
 'Dew Point Temp (C)',
 'Rel Hum (%)',
 'Wind Spd (km/h)',
 'Visibility (km)',
 'Stn Press (kPa)',
 'Weather']

In [None]:
weather_df['Weather'].value_counts()

weather_df['Temp (C)'].value_counts()
# Method Chaining
weather_df['Temp (C)'].value_counts().sum()

weather_df['Temp (C)']==16.6

# Set of values that are 16.6
weather_df_16_6 = weather_df[weather_df['Temp (C)']==16.6]
weather_df_16_6.head()

weather_df_16_6.shape

# Number of times that the weather was exactly clear
weather_df_Clear = weather_df[weather_df['Weather']=='Clear']
weather_df_Clear.head()

weather_df_Clear['Weather'].value_counts()
# ANSWER = 1326

weather_df['Weather'].value_counts()['Clear']

weather_df_Clear.shape

# Subset where weather is Clear or Foggy
weather_df[(weather_df['Weather'] == 'Clear')|( weather_df['Weather'] == 'Fog')].head()





# Create a subset where Wind Speed is > 4

weather_Wind_Spd_4 = weather_df[weather_df['Wind Spd (km/h)']>4]
weather_Wind_Spd_4.head()

weather_Wind_Spd_4['Wind Spd (km/h)'].value_counts().head()

# To get specific comlumns
weather_df[['DATE / TIME','Temp (C)','Weather']]

### Renaming the headers using

weather_df.rename({Date})

weather_df.rename(columns = {'Date/Time':'DATE/TIME'},inplace=True)


weather_df.head(10)

# loc is label bases
# it takes arguments as rows , columns where column argument may not be int always

weather_df.loc[2:4,['Dew Point Temp (C)','Weather']]



# The first 5 pressure values recorded on Jan 6
# Convert the Date/Time column into Time Format

weather_df['DATE / TIME'] = pd.to_datetime(weather_df['DATE / TIME'])

weather_df.loc[0, 'DATE / TIME']

# The first 5 pressure values recorded on Jan 6

weather_df['Month'] = weather_df['DATE / TIME'].dt.month

weather_df['Day'] = weather_df['DATE / TIME'].dt.day


weather_df.head()

weather_df[(weather_df['Day'] == 6) & (weather_df['Month']== 1)].head(5)
# THIS IS THE ANSWER FOR Jan 6 Pressure VALUES

weather_df['Quarter'] = weather_df['DATE / TIME'].dt.quarter
weather_df.head()

weather_df['quarter'].value_counts()[2]

weather_df[weather_df['quarter']==2].shape[0]

weather_df = weather_df.rename(columns = {'Temp (C)':'Temperature'},inplace = True)

weather_df = weather_df.rename(columns = {'Rel Hum (%)':'relative_humidity'}, inplace = True)

# `df.query`
## Convert all the header spaces to underscores to use query otherwise it wont work

weather_df.query('Weather == "Cloudy"').head()

weather_df['Visibility (km)']

weather_df.to_csv('../data/')

# TOP 10 Hottest Places
sorted_temp = weather_df2.sort_values(['Temp (C)'],ascending = False)
sorted_temp.head(10)

