In [2]:
import pandas as pd

#building a dataframe
#1. from dictionary
data = {
    'city':['Brooklyn','Seoul','Barcelona','Mexico City'],
    'country':['US','South Korea','Spain','Mexico'],
    'population':[2646000,9411000,1636000,9209944]

}

df=pd.DataFrame(data)
df

Unnamed: 0,city,country,population
0,Brooklyn,US,2646000
1,Seoul,South Korea,9411000
2,Barcelona,Spain,1636000
3,Mexico City,Mexico,9209944


In [3]:
#creating a dataframe out of list of lists
data=[
    ['Brooklyn','US',2646000],
    ['Seoul','South Korea',9411000],
    ['Barcelona','Spain',1636000],
    ['Mexico City','Mexico',9209944]
]

df=pd.DataFrame(data, columns=['City','Country','Population'])
df

Unnamed: 0,City,Country,Population
0,Brooklyn,US,2646000
1,Seoul,South Korea,9411000
2,Barcelona,Spain,1636000
3,Mexico City,Mexico,9209944


#3. **import the data from csv:**
```df = pd.read_csv('my_filename.csv')```

**Sometimes, data comes in .tsv (Tab-Separated Values) format instead of .csv.**

To account for this, we just need to add a delimiter parameter:
    
```df = pd.read_csv('my_filename.tsv', delimiter='\t')```

First thing first, import pandas at the top of the file.

Then, create a DataFrame named contacts containing information about your friends, family, or fictional characters. Your DataFrame should have at least 3 columns and 4 rows.

Feel free to be creative about what columns to include. If you need inspiration, you could consider columns like name, age, phone_number, astrological_sign. 💫

For example:

| name  | age | phone_number | astrological_sign |
|-------|-----|--------------|-------------------|
| Bart  | 10  | 939-555-0113 | Taurus            |
| Lisa  | 8   | 939-555-0114 | Virgo             |
| Homer | 39  | 939-555-0115 | Taurus            |
| Marge | 36  | 939-555-0116 | Pisces            |

Create the DataFrame using either the dictionary method or the list-of-lists method.

Don't forget to display the table after creating it!


In [4]:
#using dictionary to create the dataframe
astrological_data={
    'name':['Bart','Lisa','Homer','Marge'],
    'age':[10,8,39,36],
    'phone_number':['939-555-0113','939-555-0114','939-555-0115','939-555-0116'],
    'astrological_sign':['Taurus','Virgo','Taurus','Pisces']
}

df=pd.DataFrame(astrological_data)
df

Unnamed: 0,name,age,phone_number,astrological_sign
0,Bart,10,939-555-0113,Taurus
1,Lisa,8,939-555-0114,Virgo
2,Homer,39,939-555-0115,Taurus
3,Marge,36,939-555-0116,Pisces


In [5]:
#using list to create the dataframe
astrological_data=[
    ['Bart',10,'939-555-0113','Tuarus'],
    ['Lisa',8,'939-555-0114','Virgo'],
    ['Homer',39,'939-555-0115','Taurus'],
    ['Marge',36,'939-555-0116','Pisces']
]

df=pd.DataFrame(astrological_data, columns=['name','age','phone_number','astrological_sign'])
df

Unnamed: 0,name,age,phone_number,astrological_sign
0,Bart,10,939-555-0113,Tuarus
1,Lisa,8,939-555-0114,Virgo
2,Homer,39,939-555-0115,Taurus
3,Marge,36,939-555-0116,Pisces


### Data Exploration
Pandas is great for working with datasets containing thousands of rows. But when you're staring at a massive table, it can be hard to know where to begin.

Here are four pandas methods that can help with some basic data exploration:

.head()
.tail()
.info()
.describe()

If your dataset is huge, printing the whole thing would flood your screen with way too much info. This is where .head() and .tail() come in. These methods display the first 5 and last 5 rows (by default) of the DataFrame, respectively.

### Pandas Data Exploration Methods

We’ll use this sample dataset:

```python
import pandas as pd

data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Eva", "Frank"],
    "Age": [25, 30, 35, 40, 22, 28],
    "City": ["NY", "LA", "Chicago", "Houston", "Boston", "Seattle"]
}

df = pd.DataFrame(data)

print(df.head())

#output
|   | Name    | Age | City    |
| - | ------- | --- | ------- |
| 0 | Alice   | 25  | NY      |
| 1 | Bob     | 30  | LA      |
| 2 | Charlie | 35  | Chicago |
| 3 | David   | 40  | Houston |
| 4 | Eva     | 22  | Boston  |

print(df.tail())

#output
|   | Name    | Age | City    |
| - | ------- | --- | ------- |
| 1 | Bob     | 30  | LA      |
| 2 | Charlie | 35  | Chicago |
| 3 | David   | 40  | Houston |
| 4 | Eva     | 22  | Boston  |
| 5 | Frank   | 28  | Seattle |

print(df.info())

#output
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    6 non-null      object
 1   Age     6 non-null      int64 
 2   City    6 non-null      object
dtypes: int64(1), object(2)
memory usage: 272.0+ bytes

print(df.describe())

#output
|       | Age    |
| ----- | ------ |
| count | 6.000  |
| mean  | 30.000 |
| std   | 6.782  |
| min   | 22.000 |
| 25%   | 26.250 |
| 50%   | 29.000 |
| 75%   | 33.750 |
| max   | 40.000 |
**`df.describe(include="all")`** (which shows categorical stats too)

In [6]:
app_data = {
  'app_name': [
    'YouTube', 'TikTok', 'Instagram', 'Spotify', 'Duolingo', 
    'Twitter', 'Headspace', 'Discord', 'Depop'
  ],
  'category': [
    'Video', 'Social Media', 'Social Media', 'Music', 'Education',
    'Social Media', 'Health', 'Communication', 'Shopping'
  ],
  'rating': [
    4.7, 4.6, 4.5, 4.6, 4.7,
    4.3, None, 4.7, 4.4
  ],
  'downloads_millions': [
    5000, 3000, 3500, 2000, None,
    1500, 500, 600, 200
  ]
}
df=pd.DataFrame(app_data)
df

Unnamed: 0,app_name,category,rating,downloads_millions
0,YouTube,Video,4.7,5000.0
1,TikTok,Social Media,4.6,3000.0
2,Instagram,Social Media,4.5,3500.0
3,Spotify,Music,4.6,2000.0
4,Duolingo,Education,4.7,
5,Twitter,Social Media,4.3,1500.0
6,Headspace,Health,,500.0
7,Discord,Communication,4.7,600.0
8,Depop,Shopping,4.4,200.0


In [8]:
print("First 5 values for the dataset are: \n",df.head())
print("Last 5 values for the dataset are: \n",df.tail())

First 5 values for the dataset are: 
     app_name      category  rating  downloads_millions
0    YouTube         Video     4.7              5000.0
1     TikTok  Social Media     4.6              3000.0
2  Instagram  Social Media     4.5              3500.0
3    Spotify         Music     4.6              2000.0
4   Duolingo     Education     4.7                 NaN
Last 5 values for the dataset are: 
     app_name       category  rating  downloads_millions
4   Duolingo      Education     4.7                 NaN
5    Twitter   Social Media     4.3              1500.0
6  Headspace         Health     NaN               500.0
7    Discord  Communication     4.7               600.0
8      Depop       Shopping     4.4               200.0


In [9]:
#using .info() to get idea about any missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   app_name            9 non-null      object 
 1   category            9 non-null      object 
 2   rating              8 non-null      float64
 3   downloads_millions  8 non-null      float64
dtypes: float64(2), object(2)
memory usage: 416.0+ bytes


### **we can see that there are missing values in rating and download_million column**

In [None]:
#using .describe() to check the average number of downloads
desc_stats=df.describe()
mean_value=desc_stats.iloc[1,1]
print("Average number of downloads are:",mean_value)

Average numver of downloads are: 2037.5
