# Cleaning and analyzing Berkeley "Here Lived" Japanese internment data

### First, cleaning!

In [1]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Placeholder strings that stand for missing data are read in as nulls.
df = pd.read_csv(
    "here_lived_berkeley_2024-09-29.csv",
    na_values=["0000 unknown address", "unknown"],
)


In [3]:
# Preview the first rows to check that the file loaded as expected.
df.head()

Unnamed: 0,Last,First,Middle,Full Name,Address,Born,Imprisoned,Assembly Center and Camp,Release Date,Release Destination,Occupation,Notes
0,(Akiyoshi),Roy,Jitsuzo,Roy Jitsuzo (Akiyoshi),1610 Julia,"1886, Japan",April 1942,"Tanforan, Topaz","October 19, 1945",Oakland,Gardener,
1,(Akiyoshi),Miwa,Jitsuzo,Miwa Jitsuzo (Akiyoshi),1610 Julia,"1901, Japan",April 1942,"Tanforan, Topaz","October 19, 1945",Oakland,Maid,
2,(Akiyoshi),Minoru,Jitsuzo,Minoru Jitsuzo (Akiyoshi),1610 Julia,"1925, California",April 1942,"Tanforan, Topaz","July 5, 1945",New York City,,
3,(Akiyoshi),Shizuye,Jitsuzo,Shizuye Jitsuzo (Akiyoshi),1610 Julia,"1923, California",April 1942,"Tanforan, Topaz","August 6, 1945","Chicago, Illinois",,
4,(Suzuki),Fumiko,Hamada,Fumiko Hamada (Suzuki),2115 Parker,"1923, Mexico",April 1942,"Tanforan, Topaz","August 14, 1944",Milwaukee,,


In [4]:
# (number of rows, number of columns)
df.shape

(1151, 12)

In [5]:
# Column types as pandas inferred them from the CSV.
df.dtypes

Last                        object
First                       object
Middle                      object
Full Name                   object
Address                     object
Born                        object
Imprisoned                  object
Assembly Center and Camp    object
Release Date                object
Release Destination         object
Occupation                  object
Notes                       object
dtype: object

Ok, first, the address column doesn't include a city or state; it has only the street address. To be able to map these data points, I want to add "Berkeley, CA" to each address string.

In [6]:
# Append the city and state so each street address can be geocoded.
# Only rows that have an address and do not already end with the suffix are
# touched: missing addresses stay null instead of becoming the text "nan",
# and re-running this cell does not append the suffix a second time.
city_suffix = ', Berkeley, CA'
needs_suffix = (
    df['Address'].notna()
    & ~df['Address'].astype(str).str.endswith(city_suffix)
)
df.loc[needs_suffix, 'Address'] = (
    df.loc[needs_suffix, 'Address'].astype(str) + city_suffix
)

In [7]:
# Check that the Address column now carries the city and state.
df.head()

Unnamed: 0,Last,First,Middle,Full Name,Address,Born,Imprisoned,Assembly Center and Camp,Release Date,Release Destination,Occupation,Notes
0,(Akiyoshi),Roy,Jitsuzo,Roy Jitsuzo (Akiyoshi),"1610 Julia, Berkeley, CA","1886, Japan",April 1942,"Tanforan, Topaz","October 19, 1945",Oakland,Gardener,
1,(Akiyoshi),Miwa,Jitsuzo,Miwa Jitsuzo (Akiyoshi),"1610 Julia, Berkeley, CA","1901, Japan",April 1942,"Tanforan, Topaz","October 19, 1945",Oakland,Maid,
2,(Akiyoshi),Minoru,Jitsuzo,Minoru Jitsuzo (Akiyoshi),"1610 Julia, Berkeley, CA","1925, California",April 1942,"Tanforan, Topaz","July 5, 1945",New York City,,
3,(Akiyoshi),Shizuye,Jitsuzo,Shizuye Jitsuzo (Akiyoshi),"1610 Julia, Berkeley, CA","1923, California",April 1942,"Tanforan, Topaz","August 6, 1945","Chicago, Illinois",,
4,(Suzuki),Fumiko,Hamada,Fumiko Hamada (Suzuki),"2115 Parker, Berkeley, CA","1923, Mexico",April 1942,"Tanforan, Topaz","August 14, 1944",Milwaukee,,


Rows with a missing address should stay null rather than hold the literal string 'nan, Berkeley, CA', which I don't want. So I will set any such values back to null.

In [8]:
# Blank out every row whose address is exactly the stringified-null placeholder.
df['Address'] = df['Address'].mask(df['Address'] == 'nan, Berkeley, CA', np.nan)

In [9]:
# Number of rows (people) recorded at each address.
df.value_counts('Address')

Address
2076 Ashby, Berkeley, CA                         21
2928 Grove, Berkeley, CA                         14
2809 California, Berkeley, CA                    13
2213 4th Street, Berkeley, CA                    13
1535 Oregon, Berkeley, CA                        11
                                                 ..
2039 Berkeley Way, Berkeley, CA                   1
2639 McGee, Berkeley, CA                          1
2714 McGee (corner of Delaware), Berkeley, CA     1
2747 Haste, Berkeley, CA                          1
2727 Garber, Berkeley, CA                         1
Name: count, Length: 266, dtype: int64

21 people at 2076 Ashby.

In [10]:
# Tally rows with an address (False) against rows without one (True).
df['Address'].isna().value_counts()

Address
False    1131
True       20
Name: count, dtype: int64

20 null address values.

### Next... Geocoding these addresses!

In [13]:
import geocoder
import os
import time
from dotenv import load_dotenv

In [12]:
# Load environment variables via python-dotenv, then read the geocoding key
# from the environment so it is never written into the notebook itself.
load_dotenv()

api_key = os.environ.get("api_key")

In [14]:
# A function to get the latitude and longitude
def get_lat_lng(address):
    """Geocode a single address with the Google geocoder.

    Parameters
    ----------
    address : str
        Address text to look up. Anything that is not a non-blank string
        (for example None or NaN from a null DataFrame cell) is skipped
        without contacting the API.

    Returns
    -------
    list
        ``g.latlng`` from the geocoder result when the lookup succeeds,
        otherwise ``[None, None]``. Callers read index 0 as latitude and
        index 1 as longitude.
    """
    # A missing address cannot resolve to a real place, so do not spend an
    # API request on it.
    if not isinstance(address, str) or not address.strip():
        return [None, None]

    try:
        g = geocoder.google(address, key=api_key)
        if g.ok:
            return g.latlng
        return [None, None]
    except Exception as e:
        # Best effort: report the failure and carry on so that one bad lookup
        # does not abort a whole batch of addresses.
        print(f"Error geocoding {address}: {e}")
        return [None, None]

In [16]:
# Many rows share the same address, so look up each distinct address once and
# reuse the result instead of issuing one API request per row. Null addresses
# are dropped up front and end up with empty coordinates.
coords_by_address = {}
for address in df['Address'].dropna().unique():
    coords_by_address[address] = get_lat_lng(address)
    time.sleep(0.2)  # in case of API limits

# Map the cached results back onto every row, in row order.
no_match = [None, None]
latitudes = [coords_by_address.get(address, no_match)[0] for address in df['Address']]
longitudes = [coords_by_address.get(address, no_match)[1] for address in df['Address']]

# Add to the DataFrame
df['Latitude'] = latitudes
df['Longitude'] = longitudes

# Output the results
df.head()

Unnamed: 0,Last,First,Middle,Full Name,Address,Born,Imprisoned,Assembly Center and Camp,Release Date,Release Destination,Occupation,Notes,Latitude,Longitude
0,(Akiyoshi),Roy,Jitsuzo,Roy Jitsuzo (Akiyoshi),"1610 Julia, Berkeley, CA","1886, Japan",April 1942,"Tanforan, Topaz","October 19, 1945",Oakland,Gardener,,37.854333,-122.276471
1,(Akiyoshi),Miwa,Jitsuzo,Miwa Jitsuzo (Akiyoshi),"1610 Julia, Berkeley, CA","1901, Japan",April 1942,"Tanforan, Topaz","October 19, 1945",Oakland,Maid,,37.854333,-122.276471
2,(Akiyoshi),Minoru,Jitsuzo,Minoru Jitsuzo (Akiyoshi),"1610 Julia, Berkeley, CA","1925, California",April 1942,"Tanforan, Topaz","July 5, 1945",New York City,,,37.854333,-122.276471
3,(Akiyoshi),Shizuye,Jitsuzo,Shizuye Jitsuzo (Akiyoshi),"1610 Julia, Berkeley, CA","1923, California",April 1942,"Tanforan, Topaz","August 6, 1945","Chicago, Illinois",,,37.854333,-122.276471
4,(Suzuki),Fumiko,Hamada,Fumiko Hamada (Suzuki),"2115 Parker, Berkeley, CA","1923, Mexico",April 1942,"Tanforan, Topaz","August 14, 1944",Milwaukee,,,37.862476,-122.266321


Great! Now I'll save this geocoded dataframe to a CSV file.

In [17]:
# Write the geocoded table to disk; index=False leaves the row index out of the file.
df.to_csv("geocoded_here_lived.csv", index=False)

For now, this might be all we need for the mapping part of this project...

The analysis of dates and occupation categories can come later.

Mapping the camps (and sizes) and the release destinations could also come later.