In [None]:
import numpy as np

## Week 2 Numpy: Slicing and indexing trees

**Problem statement**

Imagine you are a researcher working with data from New York City's tree census. Each row of the tree_census 2D array lists information for a different tree: the tree ID, block ID, trunk diameter, and stump diameter in that order. Living trees do not have stump diameters, which explains why there are so many zeros in that column. Column order is important because NumPy does not have column names! The first and last three rows of tree_census are shown below.





In this exercise, you'll be working specifically with the second column, representing block IDs: your research requires you to select specific city blocks for further analysis using NumPy slicing and indexing. numpy is loaded as np, and the tree_census 2D array is input.


**TASK**

**a)**Select all rows of data from the second column, representing block IDs; save the resulting array as block_ids.
Print the first five block IDs from block_ids.


**b)** Select five consecutive block IDs from block_ids, starting with the tenth ID, and save as block_id_slice






In [9]:
import numpy as np

# Creating the complete tree_census array
tree_census = np.array([
    [3, 501451, 24, 0],
    [4, 501451, 20, 0],
    [7, 501911, 3, 0],
    [11, 502321, 15, 0],
    [15, 502321, 18, 0],
    [21, 502211, 8, 0],
    [27, 502211, 9, 0],
    [32, 501461, 21, 0],
    [35, 501461, 16, 0],
    [39, 502221, 7, 0],
    [42, 502221, 5, 0],
    [50, 502101, 12, 0],
    [56, 502101, 14, 0],
    [62, 502101, 10, 0],
    [68, 501431, 19, 0],
    [75, 501431, 22, 0],
    [82, 501441, 13, 0],
    [89, 501441, 17, 0],
    [97, 502351, 23, 0],
    [105, 502351, 28, 0],
    [112, 502351, 26, 0],
    [121, 501911, 11, 0],
    [130, 501911, 7, 0],
    [139, 501911, 9, 0],
    [148, 501951, 8, 0],
    [157, 501951, 10, 0],
    [166, 501951, 12, 0],
    [175, 501461, 14, 0],
    [184, 501461, 20, 0],
    [194, 501231, 16, 0],
    [204, 501231, 19, 0],
    [215, 501231, 13, 0],
    [226, 501321, 11, 0],
    [237, 501321, 17, 0],
    [249, 501321, 18, 0],
    [261, 501411, 23, 0],
    [274, 501411, 15, 0],
    [287, 501411, 9, 0],
    [300, 501581, 10, 0],
    [313, 501581, 7, 0],
    [326, 501581, 12, 0],
    [340, 501581, 16, 0],
    [354, 501581, 14, 0],
    [368, 501151, 19, 0],
    [383, 501151, 21, 0],
    [398, 501151, 8, 0],
    [414, 501231, 6, 0],
    [430, 501231, 11, 0],
    [446, 501231, 13, 0],
    [462, 501231, 10, 0],
    [479, 501231, 22, 0],
    [496, 501221, 17, 0],
    [513, 501221, 13, 0],
    [530, 501221, 6, 0],
    [547, 501221, 5, 0],
    [564, 501221, 11, 0],
    [582, 501221, 15, 0],
    [600, 501221, 8, 0],
    [618, 501221, 10, 0],
    [636, 501221, 7, 0],
    [655, 501221, 9, 0],
    [674, 501221, 14, 0],
    [693, 501221, 12, 0],
    [712, 501221, 9, 0],
    [732, 501221, 7, 0],
    [752, 501221, 10, 0],
    [772, 501221, 6, 0],
    [792, 501221, 11, 0],
    [812, 501221, 13, 0],
    [832, 501221, 12, 0],
    [853, 501221, 8, 0],
    [874, 501221, 16, 0],
    [895, 501221, 9, 0],
    [917, 501221, 5, 0],
    [939, 501221, 6, 0],
    [961, 501221, 11, 0],
    [983, 501221, 15, 0],
    [1005, 501221, 7, 0],
    [1028, 501221, 12, 0],
    [1051, 501221, 16, 0],
    [1074, 501221, 8, 0],
    [1097, 501221, 5, 0],
    [1120, 501221, 10, 0],
    [1144, 501221, 13, 0],
    [1168, 501221, 14, 0],
    [1192, 501221, 9, 0],
    [1217, 501221, 7, 0],
    [1242, 501221, 11, 0],
    [1267, 501221, 15, 0],
    [1292, 501221, 12, 0],
    [1317, 501221, 9, 0],
    [1343, 501221, 6, 0],
    [1369, 501221, 8, 0],
    [1395, 501221, 10, 0],
    [1421, 501221, 5, 0],
    [1448, 501221, 7, 0],
    [1475, 501221, 11, 0],
    [1502, 501221, 13, 0],
    [1529, 501221, 14, 0],
    [1556, 501221, 12, 0],
    [1584, 501221, 9, 0],
    [1612, 501221, 6, 0],
    [1640, 501221, 10, 0],
    [1668, 501221, 8, 0],
    [1696, 501221, 15, 0],
    [1725, 501221, 7, 0],
    [1754, 501221, 5, 0],
    [1783, 501221, 11, 0],
    [1812, 501221, 13, 0],
    [1841, 501221, 9, 0],
    ])


In [None]:
block_ids = tree_census[:, 1]
print("a) First five block IDs:", block_ids[:5])


a) First five block IDs: [501451 501451 501911 502321 502321]


In [None]:
start_index = 9
block_id_slice = block_ids[start_index : start_index + 5]
print("b) Block ID slice:", block_id_slice)

b) Block ID slice: [502221 502221 502101 502101 502101]


In [13]:
# Select all rows of block ID data from the second column
block_ids = tree_census[:, 1]

# Select the tenth block ID from block_ids
tenth_block_id = block_ids[8:9]
print(tenth_block_id)

[501461]


In [11]:
# Select all rows of block ID data from the second column
block_ids = tree_census[:, 1]

# Select five block IDs from block_ids starting with the tenth ID
start_index = 9
block_id_slice = block_ids[start_index : start_index + 5]
print("b) Block ID slice:", block_id_slice)

b) Block ID slice: [502221 502221 502101 502101 502101]


# Pandas

In [14]:
import pandas as pd

1- In the cell below, create a DataFrame `fruits` that looks like this:

> Indented block



![](https://storage.googleapis.com/kaggle-media/learn/images/Ax3pp2A.png)

In [41]:
data = {"apple":[20], "Bananas":[23]}
data = pd.DataFrame(data)
data

Unnamed: 0,apple,Bananas
0,20,23


2.

Create a dataframe `fruit_sales` that matches the diagram below:

![](https://storage.googleapis.com/kaggle-media/learn/images/CHPn7ZF.png)

In [35]:
import pandas as pd

data = {"apple": [35, 41], "Bananas": [21, 34]}
df = pd.DataFrame(data, index=["2017 Sales", "2018 Sales"])


# Displaying the DataFrame
print(df)


            apple  Bananas
2017 Sales     35       21
2018 Sales     41       34


Create a variable ingredients with a Series that looks like:

Flour     4 cups
Milk       1 cup
Eggs     2 large
Spam       1 can

In [39]:
data = {"Flour": "4 cups", "Milk": "1 cup", "Eggs":2, "Large spam":"1 can"}
data = pd.Series(data)
data

Flour         4 cups
Milk           1 cup
Eggs               2
Large spam     1 can
dtype: object

4-
Write a Pandas program to compare the elements of the two Pandas Series.

`Sample Series: [2, 4, 6, 8, 10], [1, 3, 5, 7, 10]`

Hint: Comparison mean you cna check if they are equal, less than or greater than each other

In [50]:
Sample_Series1=  [2, 4, 6, 8, 10]
Sample_Series2=[1, 3, 5, 7, 10]
Sample_Series1=pd.Series(Sample_Series1)
Sample_Series2=pd.Series(Sample_Series2)
Sample_Series11=Sample_Series1>Sample_Series2
Sample_Series21=Sample_Series1==Sample_Series2

print(Sample_Series11)
print(Sample_Series21)

0     True
1     True
2     True
3     True
4    False
dtype: bool
0    False
1    False
2    False
3    False
4     True
dtype: bool


5-

Create a pandas series from each of the items below: a list, numpy and a dictionary

**input:**



```
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
```



In [None]:
mylist = list('abcedfghijklmnopqrstuvwxyz')
pd.Series(mylist)

In [58]:
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

mylist = pd.Series(mylist)
myarr = pd.Series(myarr)
mydict = pd.Series(mydict)
print("List Series is: ",mylist)
print("Array Series is: ",myarr)
print("Dict Series is: ",mydict)



List Series is:  0     a
1     b
2     c
3     e
4     d
5     f
6     g
7     h
8     i
9     j
10    k
11    l
12    m
13    n
14    o
15    p
16    q
17    r
18    s
19    t
20    u
21    v
22    w
23    x
24    y
25    z
dtype: object
Array Series is:  0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
12    12
13    13
14    14
15    15
16    16
17    17
18    18
19    19
20    20
21    21
22    22
23    23
24    24
25    25
dtype: int64
Dict Series is:  a     0
b     1
c     2
e     3
d     4
f     5
g     6
h     7
i     8
j     9
k    10
l    11
m    12
n    13
o    14
p    15
q    16
r    17
s    18
t    19
u    20
v    21
w    22
x    23
y    24
z    25
dtype: int64


6-  Use pandas to import the Titanic data frame.

7-  Explore the DataFrame using methods like head(), tail(), info()


In [61]:
df = pd.read_csv("/content/test.csv")
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [60]:
df.tail()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
417,1309,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


## Data exploration:

8- Check the number of passengers in each passenger class using value_counts().


9- Calculate descriptive statistics for numerical variables (e.g., age, fare) using describe().


In [63]:
df.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [66]:
result_df = df.groupby('Pclass')[['PassengerId']].sum().reset_index()
result_df

Unnamed: 0,Pclass,PassengerId
0,1,117510
1,2,103968
2,3,238531
