# NumPy

In [324]:
import numpy as np

In [325]:
my_List1= [1,2,3,4,5,6]
my_List2= [5,2,8,4,7,9]
array1= np.array(my_List1)
array2= np.array(my_List2)
print(array1)
print(array2)

[1 2 3 4 5 6]
[5 2 8 4 7 9]


### For NumPy arrays (having compatible shapes and data types), the '+' operator can perform element-wise addition.

In [326]:
print(array1+array2) 

[ 6  4 11  8 12 15]


### Element-wise scalar addition

In [327]:
print(array1+10)

[11 12 13 14 15 16]


### To get the dimension of array

In [328]:
array1.ndim # Dimension

1

In [329]:
import numpy as np

# 1-dimensional array
array1 = np.array([1, 2, 3])
print(array1.ndim)  # Output: 1

# 2-dimensional array
array2 = np.array([[1, 2], [3, 4]])
print(array2.ndim)  # Output: 2

# 3-dimensional array
array3 = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
print(array3.ndim)  # Output: 3

1
2
3


## 'shape' returns a tuple with each index having the number of corresponding elements.

In [330]:
import numpy as np
my_List1= [1,2,3,4,5,6]
array1= np.array(my_List1)
array1.shape 

(6,)

In [331]:
import numpy as np

arr = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])

print(arr.shape) # Here shape returns (2, 4), which means that the array has 2 dimensions, where the first dimension has 2 elements and the second has 4.

(2, 4)


## Data type

In [332]:
array1.dtype #Data type

dtype('int64')

In [333]:
array3=np.array([1,3,5,7,'a']) # Different data types in an array
array3

array(['1', '3', '5', '7', 'a'], dtype='<U21')

### Syntax: numpy.size(arr, axis=None)
#### By default, it gives the total number of elements in an array. Here it is 3*3=9
#### axis (x,y,z: along which the elements(rows or columns) are counted)  is an integer value and it is optional.


 

In [334]:
array4= np.array([[1,2,3],[4,6,7],[2,8,9]])
print(array4.ndim) # Dimension
print(array4.shape) # Number of rows or column
print(array4.dtype) #Data type
print(array4.size)

2
(3, 3)
int64
9


In [335]:
array5= np.array([[1,2],[3,4]])
print(array5.ndim) # Dimension
print(array5.shape) 

2
(2, 2)


### List Vs Array
#### List: Heterogeneous, dynamic (size can change during run time)
#### Array: Homogeneous (Elements are of same data type), fixed (size defined at creation)- hence faster for numerical operations

In [336]:
a=list(range(5,10))
print(a) # There is comma to separate values
array6=np.array(a)
print(array6) # There are no commas to separate values

[5, 6, 7, 8, 9]
[5 6 7 8 9]


## Print values within a range

In [337]:
print(np.arange(1,100))

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
 97 98 99]


In [338]:
print(np.arange(10,100,10)) # Syntax: np.arange(start, stop, step)

[10 20 30 40 50 60 70 80 90]


In [339]:
print(np.arange(100,0,-10))

[100  90  80  70  60  50  40  30  20  10]


## Reshape the array without changing the number of elements


In [340]:
array7=np.array([1,2,3,4,5,6]) #array7 is a 1-dimensional array with 6 elements.
print(array7)
array7.reshape(2,3) #Reshapes array7 into 2 rows and 3 columns. (Row*Column should be equal to number of elements of array7)

[1 2 3 4 5 6]


array([[1, 2, 3],
       [4, 5, 6]])

In [341]:
array8=np.arange(1,101)
print(array8)
array8.reshape(10,10)

[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100]


array([[  1,   2,   3,   4,   5,   6,   7,   8,   9,  10],
       [ 11,  12,  13,  14,  15,  16,  17,  18,  19,  20],
       [ 21,  22,  23,  24,  25,  26,  27,  28,  29,  30],
       [ 31,  32,  33,  34,  35,  36,  37,  38,  39,  40],
       [ 41,  42,  43,  44,  45,  46,  47,  48,  49,  50],
       [ 51,  52,  53,  54,  55,  56,  57,  58,  59,  60],
       [ 61,  62,  63,  64,  65,  66,  67,  68,  69,  70],
       [ 71,  72,  73,  74,  75,  76,  77,  78,  79,  80],
       [ 81,  82,  83,  84,  85,  86,  87,  88,  89,  90],
       [ 91,  92,  93,  94,  95,  96,  97,  98,  99, 100]])

## Calculating the time required for operations in Python and Numpy

In [342]:
L=range(1000)
%timeit num=[i**2 for i in L] # Calculating time required for operation

85.9 μs ± 4.86 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [343]:
nums=np.arange(1000) 
%timeit nums**2

1.2 μs ± 27.1 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


## Creating matrices

In [344]:
np.ones((3,3)) # Array of ones

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [345]:
np.eye(3) # Identity matrix

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [346]:
np.diag([1,2,3,4])

array([[1, 0, 0, 0],
       [0, 2, 0, 0],
       [0, 0, 3, 0],
       [0, 0, 0, 4]])

## Write code for standard deviation

## Method 1:

In [347]:
x=np.array(range(1,25))
print(np. std(x))

6.922186552431729


### Method 2: Wrong answer; Correct later

In [349]:
import numpy as np
def std_Deviation(Data):
    avg = np.mean(Data)
    print("Mean of data:", avg)
    for i in Data:
        Dev=0
        diff= Data-avg
        Dev=Dev+diff
    print(Dev)
    std_Dev= np.sqrt(sum(Dev**2)/len(Data))
    return std_Dev
 print("Standard deviation:",std_Dev)
 
x=np.array(range(1,25))
std_Deviation(x)

IndentationError: unindent does not match any outer indentation level (<string>, line 12)

## np.diag: 
### 2 primary uses
#### -When you pass a 1D array to np.diag(), it creates a 2D square matrix with the elements of the input array on its main diagonal.
#### -When you pass a 2D array to np.diag(), it extracts the diagonal elements of that matrix.


In [None]:
z=np.diag([1,2,3,4])
np.diag(z)

array([1, 2, 3, 4])

In [None]:
z=np.diag([[1,10,56],[15,2,71],[81,49,3]])
np.diag(z)

array([[1, 0, 0],
       [0, 2, 0],
       [0, 0, 3]])

##  Random number generation module 
### Generates 1-D array containing three random floating-point numbers. These numbers are uniformly distributed between 0 (inclusive) and 1 (exclusive).
### The numbers are uniformly distributed, meaning that any number within the range [0, 1) is equally likely to occur.
### The values in the array will change each time you run the code.
### This function is often used in various applications, such as:
##### Generating random data for testing and simulations.
##### Initializing weights in neural networks.
##### Creating random samples for statistical analysis.

In [None]:
d=np.random.random(3)
d

array([0.62781754, 0.64385371, 0.09453855])

In [None]:
np.random.randn(4) #Increase the number to get Normal distribution.

array([-0.57309684,  0.04928236,  0.500908  , -0.48640371])

In [None]:
# 0: The lower bound (inclusive).
#20: The upper bound (exclusive).
#3: The number of random integers to generate.
np.random.randint(0,20,3) 

array([ 7, 17,  0], dtype=int32)

## Data type

In [None]:
a=np.arange(10)
a.dtype

dtype('int64')

In [None]:
a=np.arange(10, dtype=np.float64)
a.dtype

dtype('float64')

In [None]:
a1= np.array([1+3j])
print(a1.dtype)
a2=np.array([True, False, True])
print(a2.dtype)

complex128
bool


In [None]:
a3=np.array(["True","False"]) #creates an array of strings.
print(a3.dtype) # op <U5 elements are Unicode strings with a maximum length of 5 characters.
a=np.arange(10) #creates an array of integers.
a
a[3]

<U5


np.int64(3)

# Slicing and reversing

In [None]:
a=np.arange(10)
b=a[::2] #Create array: start,stop,step
b

array([0, 2, 4, 6, 8])

In [None]:
a=np.arange(10)

a5=a[::-2]
print("Array a in descending order with step minus two:", a5)

a5a=a[1::2]
print("Start Array a from 1 with step two:", a5a)

a6=a[1::2][::-1] # [1::2]Starts Array a from '1' with step two. '[::-1]' Reverse the order of output because step size is given as negative one.
print("Sliced and reversed array a:", a6)

a7=a[1::2][::-1][::-1] # Here the third '[::-1]' reverse the order again.
print("Reversed a6:", a7)


Array a in descending order with step minus two: [9 7 5 3 1]
Start Array a from 1 with step two: [1 3 5 7 9]
Sliced and reversed array a: [9 7 5 3 1]
Reversed a6: [1 3 5 7 9]


## Checking memory mapping

In [None]:
np.shares_memory(a,a5) # checks if two arrays, a and a5, share the same underlying memory block.

True

## Creating copy without memory mapping

In [None]:
c=a[::2].copy() # creates a deep copy of the view, ensuring that c is a separate array with its own memory.Now, any modifications made to c will not affect the original array a.

## Creating zero matrix and null matrix

In [None]:
a8=np.zeros((4,4))
a9=np.ones((4,4))
print(a8)
print(a9)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]


In [None]:
c=np.vstack((a8,a9)) #Number of rows should be same
print(c)
c.shape

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]


(8, 4)

# Horizontal stack

In [None]:
d=np.hstack((a8,a9))# Number of columns should be same
d.shape
d

array([[0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1.]])

In [None]:
x = np.array([1, 2, 3])
y = np.array([4, 5, 6])
stack=np.hstack((x,y))
stack

array([1, 2, 3, 4, 5, 6])

## As you can see, np.hstack() concatenates the two arrays horizontally, creating a single array.

### Key Points:

#### Array Shapes: The arrays being stacked must have the same number of dimensions, except for 1D arrays.
#### Memory Efficiency: np.hstack() often creates a view of the input arrays, so modifying 'stack' might affect x and y if they share memory.
#### Alternative: You can also use the np.concatenate() function with the axis=1 argument to achieve the same result.

In [None]:
a8=np.zeros((4,4))
a9=np.ones((4,4))
np.concatenate([a8,a9],axis=1) # If you give axis =0 it will give v stack(vertical) and for axis=1 it will give h stack axis=3 gives 3D

array([[0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1.]])

In [None]:
d.T # Gives transpose of d

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])

In [None]:
d=np.hstack((a8,a9))
d.T.shape # Shape of Transpose 

(8, 4)

In [None]:
a10=np.arange(4)
a11=a10.T # Transpose of a10
np.dot(a10,a11) # Dot product

np.int64(14)

In [None]:
a12=np.array([[1,2,3],[4,5,6],[7,8,9]])
print(a12)
a13=a12.T
print(a13)
np.dot(a12,a13)

[[1 2 3]
 [4 5 6]
 [7 8 9]]
[[1 4 7]
 [2 5 8]
 [3 6 9]]


array([[ 14,  32,  50],
       [ 32,  77, 122],
       [ 50, 122, 194]])

# PANDAS

## Why Pandas DataFrame is not called as table?

#### Rich Functionality: DataFrame offers a wide range of functionalities beyond basic table operations. It provides tools for data cleaning, manipulation, analysis, and visualization, making it a powerful tool for data scientists and analysts.   
#### Data Structure: While a table is a static structure, a DataFrame is a dynamic data structure that can be easily modified, reshaped, and filtered. It offers flexibility in handling diverse data formats and performing complex operations.   
#### Integration with Other Libraries: DataFrames can seamlessly integrate with other libraries like NumPy, Matplotlib, and Scikit-learn, enabling advanced data analysis and machine learning tasks.   
#### Label-Based Indexing: DataFrames have labeled axes (rows and columns), making it easier to access and manipulate specific data points.   
#### Heterogeneous Data Types: Unlike traditional tables, DataFrames can handle different data types within a single column, providing flexibility in data representation.

   


In [None]:
import pandas as pd

In [350]:
weather_data=[("1/1/2024",32,6,"Rain"),("2/11/2024",35,7,"Sunny")]
df=pd.DataFrame(weather_data,\
                columns=["date","temp","wind","climate"])
df

Unnamed: 0,date,temp,wind,climate
0,1/1/2024,32,6,Rain
1,2/11/2024,35,7,Sunny


In [351]:
df.shape # Returns number of rows and columns

(2, 4)

In [352]:
df = pd.read_csv("D:\\ICT Academy\\DataScience\\Python\\auto-mpg.csv") # Location of file 'auto-mpg.csv'.

In [353]:
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [354]:
df.shape # Size of table as rows and columns

(398, 9)

# Access rows/columns using index/label.

#### Accessing row:

In [356]:
df.loc[4] # Access specific rows or columns within a DataFrame based on their index/labels.

mpg                    17.0
cylinders                 8
displacement          302.0
horsepower              140
weight                 3449
acceleration           10.5
model year               70
origin                    1
car name        ford torino
Name: 4, dtype: object

### Acessing single column

In [358]:
column_data = df.loc[:, 'cylinders']
column_data

0      8
1      8
2      8
3      8
4      8
      ..
393    4
394    4
395    4
396    4
397    4
Name: cylinders, Length: 398, dtype: int64

### Acessing multiple columns

In [None]:
selected_columns = df.loc[:, ['cylinders', 'car name']]
selected_columns

Unnamed: 0,cylinders,car name
0,8,chevrolet chevelle malibu
1,8,buick skylark 320
2,8,plymouth satellite
3,8,amc rebel sst
4,8,ford torino
...,...,...
393,4,ford mustang gl
394,4,vw pickup
395,4,dodge rampage
396,4,ford ranger


### Access column and change order of columns

In [366]:
change_column_order = df.loc[:, ['car name','cylinders', 'model year']]
change_column_order

Unnamed: 0,car name,cylinders,model year
0,chevrolet chevelle malibu,8,70
1,buick skylark 320,8,70
2,plymouth satellite,8,70
3,amc rebel sst,8,70
4,ford torino,8,70
...,...,...,...
393,ford mustang gl,4,82
394,vw pickup,4,82
395,dodge rampage,4,82
396,ford ranger,4,82


## df.head(): By default, displays the first 5 rows of a DataFrame. If you provide a number within the brackets, it will display that much rows.

In [278]:
df.head() # Print first 5 rows.

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [279]:
df.head(2) # First two rows

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320


# Acess a column from table

#### Method 1:

In [280]:
df["mpg"] # To get column 'mpg'

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
       ... 
393    27.0
394    44.0
395    32.0
396    28.0
397    31.0
Name: mpg, Length: 398, dtype: float64

#### Method 2:

In [281]:
df.mpg # To get column 'mpg'

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
       ... 
393    27.0
394    44.0
395    32.0
396    28.0
397    31.0
Name: mpg, Length: 398, dtype: float64

In [282]:
df["car name"]

0      chevrolet chevelle malibu
1              buick skylark 320
2             plymouth satellite
3                  amc rebel sst
4                    ford torino
                 ...            
393              ford mustang gl
394                    vw pickup
395                dodge rampage
396                  ford ranger
397                   chevy s-10
Name: car name, Length: 398, dtype: object

## Returns labels/names of all columns:

In [283]:
df.columns #Get the name of columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin', 'car name'],
      dtype='object')

In [284]:
df.columns.tolist()  #Get the name of columns as a list

['mpg',
 'cylinders',
 'displacement',
 'horsepower',
 'weight',
 'acceleration',
 'model year',
 'origin',
 'car name']

## Access multiple columns:

In [285]:
df[['mpg','car name']]#Get the name of two columns

Unnamed: 0,mpg,car name
0,18.0,chevrolet chevelle malibu
1,15.0,buick skylark 320
2,18.0,plymouth satellite
3,16.0,amc rebel sst
4,17.0,ford torino
...,...,...
393,27.0,ford mustang gl
394,44.0,vw pickup
395,32.0,dodge rampage
396,28.0,ford ranger


# DataFrame to Excel conversion: 
### This code will create an Excel file named 'output.xlsx' in the same directory as your Python script. You can then open this file on your computer.
### Here, the directory will be "D:\\ICT Academy\\DataScience\\Python"

In [286]:
# If you are using VS code you have to install 'openpyxl' before using to_excel() function. To install it use command 'pip install openpyxl' in terminal.
new_df=df[['car name','weight']]#Get the content of two columns
new_df.to_excel("output.xlsx", sheet_name="sheet1")

# Change order of column 

In [287]:
new_df1=df[['weight','car name']] #Get the content of two columns in the order 'weight', 'car name'
new_df1
new_df1.to_excel("output.xlsx", sheet_name="sheet1") # Order of columns in the output excel sheet will change.(If excel file is open while executing code, it will result in error: permission denied)

# df.describe() :  Provides summary statistics of a DataFrame. It calculates various descriptive statistics for each numerical column, giving you a quick overview of the data's central tendency, dispersion, and shape.

In [288]:
df.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,3.0


## df.info(): Provides a summary of the DataFrame's structure and characteristics, including:
### General Information: This typically shows the number of rows and columns in the DataFrame.
### Column Information: For each column, it displays:
### Column name
### Non-null values (count)
### Data type
### Memory usage

In [289]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


# Read excel file

In [290]:
df_excel = pd.read_excel("D:\\ICT Academy\\DataScience\\Python\\output.xlsx", sheet_name='sheet1') # read Sheet1 of the excel file 'output.xlsx'.


print(df_excel.head()) # Print the first 5 rows

   Unnamed: 0  weight                   car name
0           0    3504  chevrolet chevelle malibu
1           1    3693          buick skylark 320
2           2    3436         plymouth satellite
3           3    3433              amc rebel sst
4           4    3449                ford torino


# To get number of rows and columns of dataframe:

In [291]:
df.shape 

(398, 9)

## Grouping and aggregating data within DataFrames.

## Key Steps in GroupBy Operations:

#### Splitting: The DataFrame is divided into smaller DataFrames based on the specified grouping criteria.
#### Applying: A function is applied to each group to perform calculations or transformations.
#### Combining: The results from each group are combined to form a new DataFrame or Series.
### Common Use Cases:

#### Aggregation: Calculating summary statistics for each group (e.g., mean, sum, count).
#### Transformation: Applying functions to each group (e.g., normalization, scaling).
#### Filtering: Selecting specific groups based on certain conditions.

In [292]:
import pandas as pd

# Sample DataFrame
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
        'Age': [25, 30, 28, 35],
        'City': ['New York', 'Los Angeles', 'New York', 'Los Angeles']}
df1 = pd.DataFrame(data)

# Group by 'City'
g1 = df1.groupby('City') # Intermediate representation of the grouping operation. Not printable.

# Calculate the mean age for each city
print(g1['Age'].mean())
#g.transform(lambda x: x - x.mean())  # Subtract the mean from each value in a group. Here we can't apply this as it can't convert string (Bob, David) to numeric value
#print(g.transform(lambda x: x - x.mean()))  # Subtract the mean from each value in a group. Here we can't apply this as it can't convert string (Bob, David) to numeric value
print(g1.sum())   # Calculate the sum of each group
print(g1.count()) # Count the number of rows in each group


City
Los Angeles    32.5
New York       26.5
Name: Age, dtype: float64
                     Name  Age
City                          
Los Angeles      BobDavid   65
New York     AliceCharlie   53
             Name  Age
City                  
Los Angeles     2    2
New York        2    2


## Random Selection: The rows are selected randomly, ensuring a fair representation of the data.
##### Sample Size: The number 10 indicates the desired sample size. You can adjust this number to get different sample sizes.
##### New DataFrame: The sample() method returns a new DataFrame containing the sampled rows.
##### No Replacement: By default, the sampling is done without replacement, meaning a row can only be selected once. (replace=True will activate replacement)
##### frac parameter: df.sample(frac=0.5) will sample 50% of the rows.

In [293]:
df_sample=df.sample(10)
print(df_sample)

      mpg  cylinders  displacement horsepower  weight  acceleration  \
5    15.0          8         429.0        198    4341          10.0   
47   19.0          6         250.0        100    3282          15.0   
253  20.5          6         200.0         95    3155          18.2   
251  20.2          8         302.0        139    3570          12.8   
150  26.0          4         108.0         93    2391          15.5   
137  13.0          8         350.0        150    4699          14.5   
12   15.0          8         400.0        150    3761           9.5   
164  21.0          6         231.0        110    3039          15.0   
107  18.0          6         232.0        100    2789          15.0   
187  17.5          8         305.0        140    4215          13.0   

     model year  origin                           car name  
5            70       1                   ford galaxie 500  
47           71       1                   pontiac firebird  
253          78       1            

In [294]:
df_sample.shape

(10, 9)

In [295]:
df_sample1=df.sample(frac=0.2) #random 20% shufled datat
print(df_sample1)

      mpg  cylinders  displacement horsepower  weight  acceleration  \
101  23.0          6         198.0         95    2904          16.0   
75   14.0          8         318.0        150    4077          14.0   
356  32.4          4         108.0         75    2350          16.8   
239  30.0          4          97.0         67    1985          16.4   
372  27.0          4         151.0         90    2735          18.0   
..    ...        ...           ...        ...     ...           ...   
196  24.5          4          98.0         60    2164          22.1   
116  16.0          8         400.0        230    4278           9.5   
387  38.0          6         262.0         85    3015          17.0   
268  27.2          4         119.0         97    2300          14.7   
357  32.9          4         119.0        100    2615          14.8   

     model year  origin                           car name  
101          73       1                    plymouth duster  
75           72       1  

# Groupby: Example 1

In [309]:
df_sampleData = pd.read_excel("D:\\ICT Academy\\DataScience\\Python\\Sample_Data.xlsx", sheet_name='Sheet1') # Location of file 'Sample_Data.xlsx'.
display(df_sampleData) 
g2= df_sampleData.groupby("Model Year")
print(g2.max()) # shows the maximum price and mileage for each model year.

Unnamed: 0,Model Year,Price,Mileage
0,2023,20000,10000
1,2023,25000,8000
2,2024,30000,5000
3,2024,35000,7000


            Price  Mileage
Model Year                
2023        25000    10000
2024        35000     7000


In [311]:
# x represents the group name (in this case, the "Model Year"), and model_df1 is a DataFrame containing the rows belonging to that group.
for x, model_df1 in g2: # Iterates over each group in the GroupBy object g. 
    print(x) # Prints the current group name (e.g., 2023, 2024).
    print(model_df1) # Prints the DataFrame containing the rows for the current group.

2023
   Model Year  Price  Mileage
0        2023  20000    10000
1        2023  25000     8000
2024
   Model Year  Price  Mileage
2        2024  30000     5000
3        2024  35000     7000


# Groupby: Example 1

In [312]:
#df
g3= df.groupby("model year")
print(g3.max())

             mpg  cylinders  displacement horsepower  weight  acceleration  \
model year                                                                   
70          27.0          8         455.0         97    4732          20.5   
71          35.0          8         400.0          ?    5140          20.5   
72          28.0          8         429.0         97    4633          23.5   
73          29.0          8         455.0         95    4997          21.0   
74          32.0          8         350.0          ?    4699          21.0   
75          33.0          8         400.0         98    4668          21.0   
76          33.0          8         351.0         95    4380          22.2   
77          36.0          8         400.0         98    4335          19.0   
78          43.1          8         318.0         97    4080          21.5   
79          37.3          8         360.0         90    4360          24.8   
80          46.6          6         225.0          ?    3381    

In [313]:
for x, model_df2 in g3:
    print(x)
    print(model_df2)

70
     mpg  cylinders  displacement horsepower  weight  acceleration  \
0   18.0          8         307.0        130    3504          12.0   
1   15.0          8         350.0        165    3693          11.5   
2   18.0          8         318.0        150    3436          11.0   
3   16.0          8         304.0        150    3433          12.0   
4   17.0          8         302.0        140    3449          10.5   
5   15.0          8         429.0        198    4341          10.0   
6   14.0          8         454.0        220    4354           9.0   
7   14.0          8         440.0        215    4312           8.5   
8   14.0          8         455.0        225    4425          10.0   
9   15.0          8         390.0        190    3850           8.5   
10  15.0          8         383.0        170    3563          10.0   
11  14.0          8         340.0        160    3609           8.0   
12  15.0          8         400.0        150    3761           9.5   
13  14.0         

# Concatenation

In [None]:
us_weather= pd.DataFrame({"City":["NY","Chicago","Seatle"],
                          "Temperature":[21,14,35],
                         "Humidity": [68,65,75]})
india_weather= pd.DataFrame({"City":["Mumbai","Delhi","Trivandrum"],
                              "Temperature":[30,21,22],
                         "Humidity":[54,75,35]})
print(us_weather)
print(india_weather)
weather_us_india=pd.concat([us_weather,india_weather],\
                           ignore_index=True) # Combine two data frames #Ignore index avoids the repetition of indices.
print(weather_us_india)

      City  Temperature  Humidity
0       NY           21        68
1  Chicago           14        65
2   Seatle           35        75
         City  Temperature  Humidity
0      Mumbai           30        54
1       Delhi           21        75
2  Trivandrum           22        35
         City  Temperature  Humidity
0          NY           21        68
1     Chicago           14        65
2      Seatle           35        75
3      Mumbai           30        54
4       Delhi           21        75
5  Trivandrum           22        35


In [None]:
weather_us_india["Temperature"]<30 # Result as boolean values. Acts as a mask to filter the DataFrame.

0     True
1     True
2    False
3    False
4     True
5     True
Name: Temperature, dtype: bool

In [318]:
weather_us_india[weather_us_india["Temperature"]<30]  # Filters the DataFrame based on the boolean mask created in the previous step.

Unnamed: 0,City,Temperature,Humidity
0,NY,21,68
1,Chicago,14,65
4,Delhi,21,75
5,Trivandrum,22,35


### Column-wise concatenation (axis=1): columns of the two DataFrames will be combined side-by-side.

In [319]:
weather_us_india=pd.concat([us_weather,india_weather],\
                           ignore_index=True,axis=1) # axis=1:Column-wise concatenation (columns of the two DataFrames will be combined side-by-side.)
print(weather_us_india)

         0   1   2           3   4   5
0       NY  21  68      Mumbai  30  54
1  Chicago  14  65       Delhi  21  75
2   Seatle  35  75  Trivandrum  22  35


## Matching and Merging: 
#### The merge() function finds rows with matching "Temperature" values in both DataFrames.
#### Merging: It combines the rows with matching "Temperature" values into a single row in the resulting DataFrame.

In [None]:
pd.merge(india_weather,us_weather, on="Temperature") #Shows places that have same temperature

Unnamed: 0,City_x,Temperature,Humidity_x,City_y,Humidity_y
0,Delhi,21,75,NY,68


## Custom indexing/ Label based indexing

In [321]:
df=pd.DataFrame([1,2,3,4], index=[48,43,21,45])# creates a DataFrame with a single column of values [1, 2, 3, 4] and a custom index [48, 43, 21, 45].
df.loc[45] # Accesses the row with the index label 45.
# Question: Why output still shows 0 as index?

0    4
Name: 45, dtype: int64

# Series : One-dimensional array-like object that can hold various data types. Often used to represent a single column of data.

### Creating series

In [367]:
# Create a Series from a list
s1 = pd.Series([1, 2, 3, 4, 5])

# Create a Series with a custom index
s2 = pd.Series([10, 20, 30], index=['a', 'b', 'c'])

# Create a Series from a dictionary
s3 = pd.Series({'A': 100, 'B': 200, 'C': 300})

### Accessing elements of series

In [368]:
# Accessing by index position
print(s1[0])  # Output: 1

# Accessing by index label
print(s2['b'])  # Output: 20

# Slicing
print(s1[1:4])  # Output: 2    2
                # 3    3
                # 4    4
                # dtype: int64

1
20
1    2
2    3
3    4
dtype: int64


### Operartions on series

In [370]:
# Arithmetic operations
print(s1 + 2)


0    3
1    4
2    5
3    6
4    7
dtype: int64


In [371]:
# Statistical calculations
print(s1.mean())
print(s1.std())

3.0
1.5811388300841898


In [373]:
# Filtering
print(s1[s1 > 2])

2    3
3    4
4    5
dtype: int64


In [374]:
# Sorting
print(s1.sort_values())

0    1
1    2
2    3
3    4
4    5
dtype: int64


# Missing value

In [379]:
city1=pd.Series([12,np.nan,7,9],index=["day1","day2","day3","day4"])
print(city1)

day1    12.0
day2     NaN
day3     7.0
day4     9.0
dtype: float64


In [381]:
city1.isna() # Returns a Series with True for missing values and False for non-missing values.

day1    False
day2     True
day3    False
day4    False
dtype: bool

# Calculate the sum of the non-null values

In [384]:
y=pd.Series([7,np.nan,5,np.nan],index=["day1","day2","day3","day4"])
print(y)
y.sum ()

day1    7.0
day2    NaN
day3    5.0
day4    NaN
dtype: float64


np.float64(12.0)

### Remove missing values:

##### The ~ operator inverts the Boolean Series. So, it becomes True for non-missing values and False for missing values.

##### city1[~city1.isna()]: This part uses Boolean indexing to select only the elements from city1 where the corresponding value in the inverted Boolean Series is True. In other words, it filters out the missing values.

In [385]:
city1[~city1.isna()]

day1    12.0
day3     7.0
day4     9.0
dtype: float64

In [495]:
df=df = pd.read_csv("D:\\ICT Academy\\DataScience\\VS\\auto-mpg.csv")
df["car name"].unique()

array(['chevrolet chevelle malibu', 'buick skylark 320',
       'plymouth satellite', 'amc rebel sst', 'ford torino',
       'ford galaxie 500', 'chevrolet impala', 'plymouth fury iii',
       'pontiac catalina', 'amc ambassador dpl', 'dodge challenger se',
       "plymouth 'cuda 340", 'chevrolet monte carlo',
       'buick estate wagon (sw)', 'toyota corona mark ii',
       'plymouth duster', 'amc hornet', 'ford maverick', 'datsun pl510',
       'volkswagen 1131 deluxe sedan', 'peugeot 504', 'audi 100 ls',
       'saab 99e', 'bmw 2002', 'amc gremlin', 'ford f250', 'chevy c20',
       'dodge d200', 'hi 1200d', 'chevrolet vega 2300', 'toyota corona',
       'ford pinto', 'plymouth satellite custom', 'ford torino 500',
       'amc matador', 'pontiac catalina brougham', 'dodge monaco (sw)',
       'ford country squire (sw)', 'pontiac safari (sw)',
       'amc hornet sportabout (sw)', 'chevrolet vega (sw)',
       'pontiac firebird', 'ford mustang', 'mercury capri 2000',
       'opel 1900'

##  Count the occurrences of unique values within a series/column:

In [499]:
df = pd.read_csv("D:\\ICT Academy\\DataScience\\VS\\auto-mpg.csv")
df["car name"].value_counts() # Classification of classes

car name
ford pinto             6
toyota corolla         5
amc matador            5
ford maverick          5
chevrolet chevette     4
                      ..
chevrolet monza 2+2    1
ford mustang ii        1
pontiac astro          1
amc pacer              1
chevy s-10             1
Name: count, Length: 305, dtype: int64

## Count the occurance of unique values within a row:

#### Method 1: Using set()

In [387]:
row_data = df.iloc[0]  # Assuming you want to analyze the first row
unique_values = set(row_data)
count_unique_values = len(unique_values)
count_unique_values

9

#### Method 2: Using value_counts() on a Transposed DataFrame

In [390]:
row_counts = df.transpose().iloc[4].value_counts()
row_counts

weight
1985    4
2130    4
2155    3
2265    3
2300    3
       ..
4376    1
4382    1
4732    1
2264    1
3449    1
Name: count, Length: 351, dtype: int64

## Remove specific rows or columns from a DataFrame.
#### labels: A single label or a list-like object of labels to drop.
#### axis: Specifies whether to drop labels from the index (0 or 'index') or columns (1 or 'columns').
#### inplace: If True, modifies the DataFrame in-place, otherwise returns a new DataFrame with the dropped labels.
#### errors: Controls how errors are handled. 'ignore' suppresses errors, while 'raise' (default) raises a KeyError for missing labels.

##### Drop column:

In [395]:
car_new=df.drop(["origin","car name"],axis=1) # Drop columns origin and car name
car_new

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year
0,18.0,8,307.0,130,3504,12.0,70
1,15.0,8,350.0,165,3693,11.5,70
2,18.0,8,318.0,150,3436,11.0,70
3,16.0,8,304.0,150,3433,12.0,70
4,17.0,8,302.0,140,3449,10.5,70
...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82
394,44.0,4,97.0,52,2130,24.6,82
395,32.0,4,135.0,84,2295,11.6,82
396,28.0,4,120.0,79,2625,18.6,82


##### Drop row

In [399]:
df.drop(index=[3, 4])  # Drop rows with indices 0 and 2

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
5,15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500
6,14.0,8,454.0,220,4354,9.0,70,1,chevrolet impala
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


## Generate descriptive statistics of a DataFrame. It provides a summary of the central tendency, dispersion, and shape of  a dataset's distribution, excluding NaN values.   


In [400]:
car_new.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model year
count,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627
min,9.0,3.0,68.0,1613.0,8.0,70.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0
max,46.6,8.0,455.0,5140.0,24.8,82.0
