# Pandas practice

### 1. Download the dataset from the this URL 
### 2. Load it into a DataFrame

In [2]:
import pandas as pd
data = pd.read_csv("Car_Insurance.csv")

### 3. Observe the name of the columns and the corresponding data type

In [12]:
data.head(5)

Unnamed: 0,Id,Age,Job,Marital,Education,Default,Balance,HHInsurance,CarLoan,Communication,LastContactDay,LastContactMonth,NoOfContacts,DaysPassed,PrevAttempts,Outcome,CallStart,CallEnd,CarInsurance
0,1,32,management,single,tertiary,0,1218,1,0,telephone,28,jan,2,-1,0,,13:45:20,13:46:30,0
1,2,32,blue-collar,married,primary,0,1156,1,0,,26,may,5,-1,0,,14:49:03,14:52:08,0
2,3,29,management,single,tertiary,0,637,1,0,cellular,3,jun,1,119,1,failure,16:30:24,16:36:04,1
3,4,25,student,single,primary,0,373,1,0,cellular,11,may,2,-1,0,,12:06:43,12:20:22,1
4,5,30,management,married,tertiary,0,2694,0,0,cellular,3,jun,1,-1,0,,14:35:44,14:38:56,0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Id                4000 non-null   int64 
 1   Age               4000 non-null   int64 
 2   Job               3981 non-null   object
 3   Marital           4000 non-null   object
 4   Education         3831 non-null   object
 5   Default           4000 non-null   int64 
 6   Balance           4000 non-null   int64 
 7   HHInsurance       4000 non-null   int64 
 8   CarLoan           4000 non-null   int64 
 9   Communication     3098 non-null   object
 10  LastContactDay    4000 non-null   int64 
 11  LastContactMonth  4000 non-null   object
 12  NoOfContacts      4000 non-null   int64 
 13  DaysPassed        4000 non-null   int64 
 14  PrevAttempts      4000 non-null   int64 
 15  Outcome           958 non-null    object
 16  CallStart         4000 non-null   object
 17  CallEnd       

### 4. Check how many missing values each column has

In [5]:
missing_values = data.isnull().sum()

print(missing_values)

Id                     0
Age                    0
Job                   19
Marital                0
Education            169
Default                0
Balance                0
HHInsurance            0
CarLoan                0
Communication        902
LastContactDay         0
LastContactMonth       0
NoOfContacts           0
DaysPassed             0
PrevAttempts           0
Outcome             3042
CallStart              0
CallEnd                0
CarInsurance           0
dtype: int64


### 5. Drop the rows containing missing values

In [6]:
df_cleaned = data.dropna()

print(df_cleaned)

        Id  Age           Job  Marital  Education  Default  Balance  \
2        3   29    management   single   tertiary        0      637   
5        6   32    technician   single   tertiary        0     1625   
15      16   61    management   single   tertiary        0        2   
16      17   34        admin.   single  secondary        0       69   
17      18   46    management  married   tertiary        0     7331   
...    ...  ...           ...      ...        ...      ...      ...   
3987  3988   27        admin.  married   tertiary        0     2855   
3990  3991   27    technician   single  secondary        0      126   
3992  3993   34    technician  married  secondary        0        0   
3995  3996   28    technician   single   tertiary        0        0   
3998  3999   36  entrepreneur   single   tertiary        0      658   

      HHInsurance  CarLoan Communication  LastContactDay LastContactMonth  \
2               1        0      cellular               3              

### 6. For the numerical values, obtain the mean average value

In [7]:
data.describe()

Unnamed: 0,Id,Age,Default,Balance,HHInsurance,CarLoan,LastContactDay,NoOfContacts,DaysPassed,PrevAttempts,CarInsurance
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,2000.5,41.21475,0.0145,1532.93725,0.49275,0.133,15.72125,2.60725,48.7065,0.7175,0.401
std,1154.844867,11.550194,0.119555,3511.452489,0.50001,0.339617,8.425307,3.064204,106.685385,2.078647,0.490162
min,1.0,18.0,0.0,-3058.0,0.0,0.0,1.0,1.0,-1.0,0.0,0.0
25%,1000.75,32.0,0.0,111.0,0.0,0.0,8.0,1.0,-1.0,0.0,0.0
50%,2000.5,39.0,0.0,551.5,0.0,0.0,16.0,2.0,-1.0,0.0,0.0
75%,3000.25,49.0,0.0,1619.0,1.0,0.0,22.0,3.0,-1.0,0.0,1.0
max,4000.0,95.0,1.0,98417.0,1.0,1.0,31.0,43.0,854.0,58.0,1.0


### 7. Observe the Job column. How many categories can you find? What is the most frequent?

In [8]:
# Count the unique categories in the 'Job' column
unique_jobs = data['Job'].nunique()

# Find the most frequent job
most_frequent_job = data['Job'].value_counts().idxmax()
most_frequent_job_count = data['Job'].value_counts().max()

print(f"Number of unique job categories: {unique_jobs}")
print(f"Most frequent job: {most_frequent_job} with {most_frequent_job_count} occurrences")

Number of unique job categories: 11
Most frequent job: management with 893 occurrences


### 8. Create a new column with the duration of each call in seconds

In [13]:
# Convert 'CallStart' and 'CallEnd' to datetime format
data['CallStart'] = pd.to_datetime(data['CallStart'], format='%H:%M:%S')
data['CallEnd'] = pd.to_datetime(data['CallEnd'], format='%H:%M:%S')

In [14]:
data['CallStart']

0      1900-01-01 13:45:20
1      1900-01-01 14:49:03
2      1900-01-01 16:30:24
3      1900-01-01 12:06:43
4      1900-01-01 14:35:44
               ...        
3995   1900-01-01 17:46:28
3996   1900-01-01 14:49:16
3997   1900-01-01 12:19:03
3998   1900-01-01 11:27:35
3999   1900-01-01 13:31:48
Name: CallStart, Length: 4000, dtype: datetime64[ns]

In [20]:
# Calculate the duration in seconds
data['CallDurationSeconds'] = (data['CallEnd'] - data['CallStart']).dt.total_seconds()
data['CallDurationSeconds'] 

0        70.0
1       185.0
2       340.0
3       819.0
4       192.0
        ...  
3995    269.0
3996    125.0
3997    290.0
3998     99.0
3999    274.0
Name: CallDurationSeconds, Length: 4000, dtype: float64

### 9. What is the average duration of each call?

In [22]:
average_call_duration = data['CallDurationSeconds'].mean()
average_call_duration

np.float64(350.844)

In [24]:
print(f"Mean Call Duration in Seconds: {average_call_duration}")

Mean Call Duration in Seconds: 350.844


### 10. For those people contacted during the first half of the year (Jan-June). What is the most common way of communication (telephone, cellular...)?

In [26]:
# Define the months for the first half of the year
first_half_months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun']

# Filter the DataFrame
df_first_half = data[data['LastContactMonth'].isin(first_half_months)]

In [27]:
# Drop rows where 'Communication' is None
df_first_half = df_first_half.dropna(subset=['Communication'])

# Find the most common way of communication
most_common_communication = df_first_half['Communication'].mode()[0]

print(f"Most common way of communication: {most_common_communication}")

Most common way of communication: cellular
