# Pandas Data Analysis in Salem Witchcraft

In [1]:
# Library
import pandas as pd

In [2]:
# Read CSV file 
accused = pd.read_csv('Accused-Witches-Data-Set.csv')
anti = pd.read_csv('Anti-Parris-Social-Data-Set.csv')
committee = pd.read_csv('Committee-List-Data-Set.csv')
committee_yearly = pd.read_csv('Committee-Yearly-Data-Set.csv')
pro = pd.read_csv('Pro-Parris-Social-Data-Set.csv')
salem = pd.read_csv('Salem-Village-Data-Set.csv')
tax = pd.read_csv('Tax-Comparison-Data-Set.csv')
towns = pd.read_csv('Towns-Data-Set.csv')

## Accused Witches Data Set

In [3]:
# Read first 5 rows
accused.head()

Unnamed: 0,Accused Witch,Residence,Month of Accusation,Month of Execution,Sort
0,"Abbott, Arthur",Ipswich,5,,1
1,"Abbott, Nehemiah, Jr.",Topsfield,4,,2
2,"Alden, John",Boston,5,,3
3,"Andrew, Daniel",Salem Village,5,,4
4,"Barker, Abigail",Andover,9,,5


In [4]:
# Read last 5 rows
accused.tail()

Unnamed: 0,Accused Witch,Residence,Month of Accusation,Month of Execution,Sort
147,"Wilds, Sarah",Topsfield,4,7.0,148
148,"Wilford, Ruth",Haverhill,8,,149
149,"Willard, John",Salem Village,5,8.0,150
150,"Wilson, Sarah Jr.",Andover,9,,151
151,"Wilson, Sarah Sr.",Andover,9,,152


In [5]:
# Rows, Columns
accused.shape

(152, 5)

In [6]:
# Summary Statistics
accused.describe()

Unnamed: 0,Month of Accusation,Month of Execution,Sort
count,152.0,20.0,152.0
mean,6.039474,8.1,76.5
std,2.752197,0.967906,44.022721
min,-1.0,6.0,1.0
25%,4.75,7.0,38.75
50%,5.0,8.0,76.5
75%,8.0,9.0,114.25
max,11.0,9.0,152.0


In [7]:
# Names of the columns in dataset
accused.columns

Index(['Accused Witch', ' Residence ', 'Month of Accusation',
       'Month of Execution', 'Sort'],
      dtype='object')

In [8]:
print('Number of accused witches by their residence:')
accused[' Residence '].value_counts()

Number of accused witches by their residence:


 Andover               45
 Salem Town            24
 Salem Village         15
 Gloucester             9
 Reading                7
 Topsfield              6
 Haverhill              6
 Rowley                 5
 Lynn                   5
 Beverly                4
 Ipswich                4
 Woburn                 3
 Billerica              3
 Boxford                3
 Charlestown            2
 Piscataqua, Maine      2
 Boston                 2
 Salisbury              1
 Marblehead             1
 Manchester             1
 Chelmsford             1
 Wells, Maine           1
 Amesbury               1
 Malden                 1
Name:  Residence , dtype: int64

The town of Andover has the most accustion; however, the last 7 residence have the least accustion. Salem Town and Salem Village is the next has the most accusation. 

In [9]:
print('Number of accused by month of excecution:')
accused['Month of Execution'].value_counts()

Number of accused by month of excecution:


9.0    9
7.0    5
8.0    5
6.0    1
Name: Month of Execution, dtype: int64

In [10]:
print('Number of accused by month:')
accused['Month of Accusation'].value_counts()

Number of accused by month:


 5     39
 9     33
 8     23
 4     22
 7     12
-1      9
 3      4
 11     3
 6      3
 2      3
 10     1
Name: Month of Accusation, dtype: int64

In [11]:
# This is the order of Months of Accusation. 
# -1 indicates that the actual month of accusation is not known
accused.groupby(["Month of Accusation"])["Month of Accusation"].count()

Month of Accusation
-1      9
 2      3
 3      4
 4     22
 5     39
 6      3
 7     12
 8     23
 9     33
 10     1
 11     3
Name: Month of Accusation, dtype: int64

In [12]:
# This is the order of Months of Execution. 
accused.groupby(["Month of Execution"])["Month of Execution"].count()

Month of Execution
6.0    1
7.0    5
8.0    5
9.0    9
Name: Month of Execution, dtype: int64

On month 9 has the most excution and in month 6 has the least excution. However, on month 5  has the most accustion. On the other hand, on month 10 has the least accustion. 

## Anti-Parris Social Data Set

In [13]:
# Read the first 5 rows 
anti.head()

Unnamed: 0,Name,Identification,Sex,Sort
0,"Porter, Joseph",Young men. 16 years old,M,1
1,"Porter, Sam:",Young men. 16 years old,M,2
2,"Preston, Jno.",Young men. 16 years old,M,3
3,"Porter, Nath.",Young men. 16 years old,M,4
4,"Swinnerton, Ben:",Young men. 16 years old,M,5


In [14]:
# Read the last 5 rows 
anti.tail()

Unnamed: 0,Name,Identification,Sex,Sort
79,"Kittel, James",Free-Holder,M,82
80,"Porter, Ben:",Free-Holder,M,80
81,"Porter, Israel",Free-Holder,M,83
82,"Small, Will:",Free-Holder,M,81
83,"Swinneron, Jasper",Free-Holder,M,84


In [15]:
# Rows, Columns
anti.shape

(84, 4)

In [16]:
# Counts for each indentification in total
anti['Identification'].value_counts()

Householder                29
[Church] Member            17
Young men. 16 years old    17
Non-Member                 16
Free-Holder                 5
Name: Identification, dtype: int64

In [17]:
# Counts for each toal sex
anti['Sex'].value_counts()

M    57
F    27
Name: Sex, dtype: int64

In [18]:
# This count for male in each indentification
print('Number of Indentification for Male:')
anti['Identification'][anti['Sex']=='M'].value_counts()

Number of Indentification for Male:


Householder                29
Young men. 16 years old    17
[Church] Member             6
Free-Holder                 5
Name: Identification, dtype: int64

In [19]:
# This count for female in each indentification
print('Number of Indentification for Female:')
anti['Identification'][anti['Sex']=='F'].value_counts()

Number of Indentification for Female:


Non-Member         16
[Church] Member    11
Name: Identification, dtype: int64

There are more male in Anti-Parris than female because many of them are young men and 16 years old based on the dataset. Also, all the males were in Householder. Majority of Householder are in Anti-AntiParris; on the other hand, the Free-Holder was the least. Most of th female are Non-Member or Church Member. 

## Committee List Data Set

In [20]:
# First 5 rows
committee.head()

Unnamed: 0,Committee Members,Petition,Social,1685,1686,1687,1688,1689,1690,1691,1692,1693,1694,1695,1696,1697,1698,Sort
0,"Putnam, Lt. John [Senr]",Pro-P,Church Member,1685.0,1686.0,1687.0,1688.0,1689.0,,,,,,,1696.0,,,1
1,"Walcott, Jonathan",Pro-P,Householder,1685.0,,,,,,,,,,,,,,2
2,"Buxton, John",Anti-P,Householder,1685.0,,,,,,,,,,,,1697.0,,3
3,"Sibley, William",NoS,Householder,1685.0,1686.0,,,,,,,,,,,,,4
4,"Putnam, Thomas Junr",Pro-P,Church Member,1685.0,1686.0,1687.0,,,,,,,1694.0,1695.0,1696.0,1697.0,1698.0,5


In [21]:
# Last 5 rows
committee.tail()

Unnamed: 0,Committee Members,Petition,Social,1685,1686,1687,1688,1689,1690,1691,1692,1693,1694,1695,1696,1697,1698,Sort
26,"Dale, John",Pro-P,Householder,,,,,,,,,,,,1696.0,,,27
27,"Wilknes, Benjamin",Pro-P,Church Member,,,,,,,,,,,,1696.0,,,28
28,"Walcott, John",Pro-P,Householder,,,,,,,,,,,,1696.0,,,29
29,"Nurs, Samuell",Anti-P,Church Member,,,,,,,,,,,,,1697.0,,30
30,"Rayment, Thomas",Anti-P,Householder,,,,,,,,,,,,,,1698.0,31


In [22]:
# Rows, Columns
committee.shape

(31, 18)

In [23]:
# Counts each total petition
committee['Petition'].value_counts()

Anti-P    15
Pro-P     13
NoS        3
Name: Petition, dtype: int64

In [24]:
# Counts each total Social 
committee['Social'].value_counts()

Householder      16
Church Member    14
Freeholder        1
Name: Social, dtype: int64

In [53]:
# Value Counts for multi columns
committee[['1685','1686','1687','1688','1690','1691','1692','1693','1694','1695','1696','1697','1698']].apply(pd.value_counts)

Unnamed: 0,1685,1686,1687,1688,1690,1691,1692,1693,1694,1695,1696,1697,1698
1685.0,5.0,,,,,,,,,,,,
1686.0,,5.0,,,,,,,,,,,
1687.0,,,5.0,,,,,,,,,,
1688.0,,,,4.0,,,,,,,,,
1690.0,,,,,5.0,,,,,,,,
1691.0,,,,,,5.0,,,,,,,
1692.0,,,,,,,5.0,,,,,,
1693.0,,,,,,,,5.0,,,,,
1694.0,,,,,,,,,5.0,,,,
1695.0,,,,,,,,,,5.0,,,


## Committee Yearly Data Set

In [None]:
# First 5 Rows
committee_yearly.head()

In [None]:
# Last 5 rows
committee_yearly.tail()

In [None]:
committee_yearly.shape

## Pro Parris Social Data Set

In [None]:
# First 5 Rows
pro.head()

In [None]:
# Last 5 rows
pro.tail()

In [None]:
print('Number of singers by identification category:')
pro['Identification'].value_counts()

In [None]:
print('Number of singers by sex:')
pro['Sex'].value_counts()

In [None]:
print('Number of male in identification:')
pro['Identification'][pro['Sex']=='M'].value_counts()

In [None]:
print('Number of female in identification:')
pro['Identification'][pro['Sex']=='F'].value_counts()

There are more Householder than Church-Member and more Male in Pro Parris. Therefore, there more males in Householder compare more females in Church-Member.

## Salem Village DataSet

In [None]:
# First 5 Rows
salem.head()

In [None]:
# Last 5 rows
salem.tail()

In [None]:
salem['Petition'].value_counts()

In [None]:
salem['Church to 1696'].value_counts()

## Tax Comparison Data Set

In [None]:
# First 5 Rows
tax.head()

In [None]:
# Last 5 rows
tax.tail()

In [None]:
tax['Petition'].value_counts()

In [None]:
# Value Counts for multi columns
tax[['1681','1690','1694','1695','1697','1700']].apply(pd.Series.value_counts)

In [None]:
# Too many NaN, so fill it up with 0
result = tax[['1681','1690','1694','1695','1697','1700']].apply(pd.value_counts).fillna(0)
result

In [None]:
print('List of Name in certain petition:')
pd.crosstab(tax['Name'], tax['Petition'])

In [None]:
# tax[['1681','1690','1694','1695','1697','1700']].sum(axis=1)
tax['Total'] = tax.iloc[:,2:7].sum(axis=1)
tax

In [None]:
print('List of Total Tax Paid:')
tax.sort_values('Total', ascending=False)

In [None]:
print('The top 5 tax payers:')
print(tax.sort_values('Total', ascending=False)[0:5])

## Towns Data Set

In [None]:
# First 5 Rows
towns.head()

In [None]:
# Last 5 rows
towns.tail()