In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import seaborn as sns

The following data is from NYS Open data and lists active corporations since 1800. Full documentation can be found here: https://dev.socrata.com/foundry/data.ny.gov/vz7i-btsq

In [2]:
corps = pd.read_json ('https://data.ny.gov/resource/vz7i-btsq.json')

In [3]:
corps.head()

Unnamed: 0,chairman_address_1,chairman_address_2,chairman_city,chairman_name,chairman_state,chairman_zip,county,current_entity_name,dos_id,dos_process_address_1,...,location_city,location_name,location_state,location_zip,registered_agent_address_1,registered_agent_address_2,registered_agent_city,registered_agent_name,registered_agent_state,registered_agent_zip
0,,,,,,,KINGS,00:02:59 LLC,3211809,"656 UNION STREET, APT. 2",...,,,,,,,,,,
1,,,,,,,KINGS,000 LLC,4472700,304 MESEROLE ST #2A,...,,,,,,,,,,
2,,,,,,,SUFFOLK,"000 MEADOW LANE, LLC",4723823,34 PANTIGO ROAD,...,,,,,,,,,,
3,,,,,,,SUFFOLK,00168 HOME CORP.,5224240,28 PHEASANT RUN LANE,...,,,,,,,,,,
4,,,,,,,NEW YORK,"002 MERCURY TACOS, LLC",4610986,131 7TH AVE S,...,,,,,,,,,,


In [4]:
corps.describe()

Unnamed: 0,dos_id
count,1000.0
mean,3845011.0
std,1248298.0
min,11786.0
25%,3123032.0
50%,4195544.0
75%,4838516.0
max,5349008.0


In [5]:
corps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 30 columns):
chairman_address_1            98 non-null object
chairman_address_2            23 non-null object
chairman_city                 98 non-null object
chairman_name                 98 non-null object
chairman_state                98 non-null object
chairman_zip                  98 non-null object
county                        999 non-null object
current_entity_name           1000 non-null object
dos_id                        1000 non-null int64
dos_process_address_1         992 non-null object
dos_process_address_2         210 non-null object
dos_process_city              992 non-null object
dos_process_name              992 non-null object
dos_process_state             992 non-null object
dos_process_zip               990 non-null object
entity_type                   1000 non-null object
initial_dos_filing_date       1000 non-null object
jurisdiction                  1000 non-null obj

So here I would want to figure out what to do given the wide variation in NULLs across columns.

In [6]:
city_group = corps.groupby('location_city')

In [7]:
print(city_group)

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x1a189c4358>


In [8]:
city_group.head()

Unnamed: 0,chairman_address_1,chairman_address_2,chairman_city,chairman_name,chairman_state,chairman_zip,county,current_entity_name,dos_id,dos_process_address_1,...,location_city,location_name,location_state,location_zip,registered_agent_address_1,registered_agent_address_2,registered_agent_city,registered_agent_name,registered_agent_state,registered_agent_zip
0,,,,,,,KINGS,00:02:59 LLC,3211809,"656 UNION STREET, APT. 2",...,,,,,,,,,,
1,,,,,,,KINGS,000 LLC,4472700,304 MESEROLE ST #2A,...,,,,,,,,,,
2,,,,,,,SUFFOLK,"000 MEADOW LANE, LLC",4723823,34 PANTIGO ROAD,...,,,,,,,,,,
3,,,,,,,SUFFOLK,00168 HOME CORP.,5224240,28 PHEASANT RUN LANE,...,,,,,,,,,,
4,,,,,,,NEW YORK,"002 MERCURY TACOS, LLC",4610986,131 7TH AVE S,...,,,,,,,,,,
10,10 RED OAK DRIVE,,SCOTIA,RICHARD P. SZMYR,NEW YORK,12302,SCHENECTADY,"007 ELECTRONICS NETWORK, INC.",2067108,10 RED OAK DRIVE,...,SCOTIA,RICHARD P. SZMYR,NEW YORK,12302,,,,,,
25,63 SKILLMAN AVENUE APT 1,,BROOKLYN,BENEDETTA AMADI,NEW YORK,11211,NEW YORK,010COMMUNICATIONS INC.,3785280,"63 SKILLMAN AVENUE, SUITE #1",...,BROOKLYN,010COMMUNICATIONS INC,NEW YORK,11211,,,,,,
33,61-20 185TH ST,#2,FRESH MEADOWS,ALLAN CHEMTOB,NEW YORK,11365,NEW YORK,011 SERVICES CORP.,4154481,61-20 185TH ST,...,FRESH MEADOWS,011 SERVICES CORP.,NEW YORK,11365,1 MAIDEN LANE 5TH FLOOR,,NEW YORK,"SPIEGEL & UTRERA, P.A., P.C.",NEW YORK,10038
34,200 S SERVICE RD,STE 211,ROSLYN HEIGHTS,PARMOD KUMAR,NEW YORK,11577,QUEENS,011 TELECOM INC.,2664830,200 S SERVICE RD,...,ROSLYN HEIGHTS,PARMOD KUMAR,NEW YORK,11577,,,,,,
51,812 58TH ST,,BROOKLYN,FANG LIN,NEW YORK,11220,KINGS,02 BEAUTY CENTER INC,3635057,812 58TH STREET,...,BROOKLYN,02 BEAUTY CENTER INC,NEW YORK,11220,,,,,,


In [11]:
city_group.describe()

Unnamed: 0_level_0,dos_id,dos_id,dos_id,dos_id,dos_id,dos_id,dos_id,dos_id
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
location_city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
ALEXANDRIA BAY,1.0,1692930.0,,1692930.0,1692930.0,1692930.0,1692930.0,1692930.0
ASTORIA,1.0,2193682.0,,2193682.0,2193682.0,2193682.0,2193682.0,2193682.0
BAITING HOLLOW,1.0,1164657.0,,1164657.0,1164657.0,1164657.0,1164657.0,1164657.0
BAYPORT,1.0,3314005.0,,3314005.0,3314005.0,3314005.0,3314005.0,3314005.0
BAYSIDE,1.0,933983.0,,933983.0,933983.0,933983.0,933983.0,933983.0
BREWSTER,1.0,2571259.0,,2571259.0,2571259.0,2571259.0,2571259.0,2571259.0
BRONX,4.0,3394273.0,1054467.0,1823461.0,3327186.25,3830551.0,3897638.0,4092530.0
BROOKLYN,15.0,2475003.0,1191289.0,108714.0,1695267.0,2183633.0,3702509.0,3813470.0
CENTEREACH,1.0,2331187.0,,2331187.0,2331187.0,2331187.0,2331187.0,2331187.0
CHITTENANGO,1.0,3985227.0,,3985227.0,3985227.0,3985227.0,3985227.0,3985227.0


__question__ shouldn't this have given me the number of corporations in each city? also how are there so few cities given that there are many more in NYS (and 99 in the corp describe?)

In [12]:
city_group.count('kings')

TypeError: count() takes 1 positional argument but 2 were given

In [13]:
corp.count('kings')

NameError: name 'corp' is not defined

__question__ why is 'corp' not defined, when that's the name of the dataframe that i am pulling data from? 

In [14]:
corps.min(initial_dos_filing_date)

NameError: name 'initial_dos_filing_date' is not defined

__question__ it looks like all of my data is objects, maybe that is my problem? would i need to convert it either to strings in order to group etc? 