In [1]:
#A Pandas Series is a one-dimensional NumPy-like array, with each element having an index.
#A series behaves very much like a dictionary that includes an index.
#To create a series, you first need to import the pandas library and then use the Series class, as follows:

In [2]:
import pandas as pd


In [3]:
series = pd.Series([1, 2, 3, 4, 5])
print(series)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [4]:
#You can specify an optional index for a series using the index parameter, as follows:

In [5]:
series = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
print(series)

a    1
b    2
c    3
d    4
dtype: int64


In [6]:
#Accessing an element in a series is similar to accessing an element in an array.  You can use the position of the element as follows:

In [7]:
print(series[2])

3


In [8]:
print(series.iloc[2])

3


In [9]:
#The iloc allows you to specify an element via its position.

In [10]:
#Alternatively, you can also specify the value of the index of the element you wish to access like this:

In [11]:
print(series['d'])

4


In [12]:
print(series.loc['d'])

4


In [13]:
#You can perform slicing on a series, as follows:

In [14]:
print(series[2:])

c    3
d    4
dtype: int64


In [15]:
#Often, you want to create a timeseries, such as running sequences of dates in a month.  You could use the date_range() function for this purpose, as follows:

In [16]:
dates1 = pd.date_range('20190525', periods=12)
print(dates1)

DatetimeIndex(['2019-05-25', '2019-05-26', '2019-05-27', '2019-05-28',
               '2019-05-29', '2019-05-30', '2019-05-31', '2019-06-01',
               '2019-06-02', '2019-06-03', '2019-06-04', '2019-06-05'],
              dtype='datetime64[ns]', freq='D')


In [17]:
#To assign the range of dates as the index of a series, use the index property of the series as follows:

In [18]:
series = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
series.index = dates1
print(series)

2019-05-25     1
2019-05-26     2
2019-05-27     3
2019-05-28     4
2019-05-29     5
2019-05-30     6
2019-05-31     7
2019-06-01     8
2019-06-02     9
2019-06-03    10
2019-06-04    11
2019-06-05    12
Freq: D, dtype: int64


In [19]:
#Date Ranges.  The periods parameter specifies how many dates you want to create, and the default frequency is D, for Daily.
#If you want to change the frequency to month, use the freq parameter and set it to M (each date is then the last day of its month), as follows:

In [20]:
dates2 = pd.date_range('20190525', periods=12, freq = 'M')
print(dates2)

DatetimeIndex(['2019-05-31', '2019-06-30', '2019-07-31', '2019-08-31',
               '2019-09-30', '2019-10-31', '2019-11-30', '2019-12-31',
               '2020-01-31', '2020-02-29', '2020-03-31', '2020-04-30'],
              dtype='datetime64[ns]', freq='M')


In [21]:
#You can also make each date fall on the first day of the month by setting freq to MS (month start), as follows:

In [22]:
dates3 = pd.date_range('05-01-2019', periods=12, freq = 'MS')
print(dates3)

DatetimeIndex(['2019-05-01', '2019-06-01', '2019-07-01', '2019-08-01',
               '2019-09-01', '2019-10-01', '2019-11-01', '2019-12-01',
               '2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01'],
              dtype='datetime64[ns]', freq='MS')


In [23]:
#In addition, you can also set the time, as follows:

In [24]:
dates4 = pd.date_range('2019/05/17 09:00:00', periods=8, freq='H')
print(dates4)

DatetimeIndex(['2019-05-17 09:00:00', '2019-05-17 10:00:00',
               '2019-05-17 11:00:00', '2019-05-17 12:00:00',
               '2019-05-17 13:00:00', '2019-05-17 14:00:00',
               '2019-05-17 15:00:00', '2019-05-17 16:00:00'],
              dtype='datetime64[ns]', freq='H')


In [25]:
#DataFrames are very useful in the world of data science and machine learning, as they closely mirror how data is stored in real life.

In [26]:
#Creating a DataFrame

In [27]:
#You can create a Pandas DataFrame using the DataFrame() class:

In [28]:
import pandas as pd
import numpy as np

In [29]:
df = pd.DataFrame(np.random.randn(10, 4),
                  columns=list('ABCD'))
print(df)

          A         B         C         D
0 -0.403085  2.073766 -2.612140  0.343401
1  0.478953 -1.461931  0.596474  0.223021
2 -0.149036 -0.637716  0.044890 -0.818173
3 -0.339224 -0.468766  0.548478  1.797544
4 -0.729877  1.671328 -0.461578 -2.218269
5  0.678028 -0.369176 -2.221671  1.862874
6 -0.605162  1.152462 -0.929371 -1.401879
7 -0.859455 -0.289253 -0.535201 -0.131098
8 -0.831198  0.242815 -0.149576 -0.199087
9 -0.554079 -0.695258  0.810425  0.123284


In [30]:
#More often than not, a DataFrame is loaded with the content of a CSV file.  We can read in a CSV file as follows:

In [None]:
df = pd.read_csv('data.csv')

In [None]:
#You can set the index for the DataFrame as follows:

In [None]:
df = pd.read_csv('data.csv')
# Size the date index from the data itself: assigning an index whose length
# differs from the number of rows raises a ValueError.
days = pd.date_range('20190525', periods=len(df))
df.index = days
print(df)

In [None]:
#To get the index of the DataFrame, use the index property as follows:

In [None]:
print(df.index)

In [None]:
#If you want to get the values of the entire DataFrame as a two-dimensional ndarray, use the values property, as follows:

In [None]:
print(df.values)

In [None]:
#Generating descriptive statistics on the DataFrame

#You can use describe(), as follows:

In [None]:
print(df.describe())

In [None]:
#You can compute the mean of each column in the DataFrame, as follows:

In [None]:
print(df.mean(0)) # 0 is the axis argument: the mean of each column is computed.  Pass 1 instead to compute the mean of each row.

In [None]:
#Extracting from DataFrames

In [None]:
#Slicing based on row number.  If you wish to extract specific rows and columns in a dataframe, you need to use the iloc indexer.
#The following code snippet extracts row numbers 2 and 3, and column numbers 1 to 3:

In [None]:
# iloc slices by position and excludes the end of each slice: rows 2-3, columns 1-3.
print(df.iloc[2:4, 1:4])

In [None]:
#Slicing based on Labels
#Besides extracting rows and columns using their row and column numbers, you can also extract them by label.  
#For example, the following code snippet extracts a range of rows using their index values:

In [None]:
# loc slices by label, and both endpoints of a label slice are included.
print(df.loc['20190601':'20190601', 'A':'C'])

In [None]:
#Oddly, if you want to extract specific rows with datetime as the index, you cannot simply pass the date value to the loc indexer.
#First you need to convert the date into a datetime format, as follows:

In [None]:
from datetime import datetime
date1 = datetime(2019, 6, 1, 0, 0, 0)
date2 = datetime(2019, 6, 3, 0, 0, 0)
print(df.loc[[date1,date2]])

In [34]:
#If you want a specific row and specific columns, you can extract them as follows:

In [None]:
print(df.loc[date1, ['A','C']])

In [36]:
#Selecting a Single Cell in a DataFrame

#If you simply wish to access a single cell in a dataframe, there is an accessor that does just that: at.
#Using the same example as in the previous section, if you want to get the value of a specific cell, you can use the following code snippet:

In [None]:
from datetime import datetime
# The key must be a date that exists in the index (ten days starting
# 2019-05-25); a date outside the index raises a KeyError.
d = datetime(2019, 6, 3, 0, 0, 0)
print(df.at[d,'B'])

In [None]:
#Selecting based on Cell value

print(df[(df.A > 0) & (df.B > 0)])

In [None]:
#Transforming DataFrames.

#If you need to reflect the dataframe over its main diagonal, you can use the transpose() function as follows:

print(df.transpose())

#Alternatively, you can use the T property, which is an accessor of the transpose() function, as follows:

print(df.T)

In [None]:
#Checking to see if a result is a dataframe or series

#One of the common problems that you will face with Pandas is knowing if the result that you have obtained is a series or a dataframe.  
#To solve this mystery, here is a function that can make your life easier:

def checkSeriesOrDataframe(var):
    """Report whether var is a pandas DataFrame or a pandas Series.

    Returns the string 'DataFrame' or 'Series', or None when var is
    neither of the two.
    """
    if isinstance(var, pd.DataFrame):
        return 'DataFrame'
    if isinstance(var, pd.Series):
        return 'Series'
    # Neither type matched; make the fall-through result explicit.
    return None

In [None]:
#Sorting Data in a DataFrame

#There are two ways that you can sort the data in a DataFrame.
#1) sort labels using the sort_index() function
#2) sort by value using the sort_values() function

#Sorting using index as follows:

print(df.sort_index(axis=0, ascending=False))

#Setting axis = 1 means that you're sorting by column labels.

#Sorting by value as follows:

print(df.sort_values('A', axis=0))

#To sort based on a particular index, set the axis parameter to 1.

In [None]:
#Applying Functions to a DataFrame.

#You can also apply functions to values in a DataFrame using the apply() function.  
#First, let's define the two lambda functions as follows:

import math
# math.sqrt raises ValueError for negative input, so non-positive values are
# passed through unchanged instead of being square-rooted.
sq_root = lambda x: math.sqrt(x) if x > 0 else x
# Squares its argument.
sq      = lambda x: x**2

#We can now apply the functions to the DataFrame, as follows:

print(df.B.apply(sq_root))

#or

print(df.B.apply(sq))
      
      
#If you want to apply the square root function to the entire DataFrame, you can iterate through the columns and apply the function to each column, as follows:
      
      
# Overwrite each column in turn with the result of applying sq_root to its values.
for col_name in df.columns:
    df[col_name] = df[col_name].apply(sq_root)
print(df)

In [None]:
#Generating a Crosstab

#In statistics, a crosstab is used to aggregate and jointly display the distribution of two or more variables.  It shows the relationships between these variables.
#Consider the following DataFrame:

# Build a five-row DataFrame from a dictionary; the dictionary keys become the column names.
df = pd.DataFrame(data={
    "Gender": ['Male', 'Male', 'Female', 'Female', 'Female'],
    "Team": [1, 2, 3, 4, 5],
})
print(df)

#Here you're creating a DataFrame using a dictionary.