In [2]:
import pandas as pd

Depending on the file format, these are the most commonly used functions for importing data into a pandas DataFrame:
- pd.read_csv() - for CSV files (and other delimited text files such as .psv and .tsv)
- pd.read_excel() - for Excel files
- pd.read_json() - for JSON files
- pd.DataFrame.from_dict() - from a Python dictionary

We can also export a DataFrame to several formats. These are DataFrame methods, so they are called on the DataFrame itself (e.g. df.to_csv()) rather than on pd:
- df.to_csv() - to CSV files (and .psv and .tsv)
- df.to_excel() - to Excel files
- df.to_json() - to JSON files
- df.to_dict() - to a Python dictionary

In [2]:
# Load 'bp.txt' into a DataFrame and preview the first three rows.
# The file is tab-delimited, so the separator is set to '\t'
# (read_csv reads any delimited text file, not only comma-separated ones).
df = pd.read_csv(
    r'C:\Users\nrmmw\Documents\Flatiron\dsc-importing-data-using-pandas\Data\bp.txt',
    sep='\t',
)
df.head(3)

Unnamed: 0,Pt,BP,Age,Weight,BSA,Dur,Pulse,Stress
0,1,105,47,85.4,1.75,5.1,63,33
1,2,115,49,94.2,2.1,3.8,70,14
2,3,116,49,95.3,1.98,8.2,72,10


In [3]:
# nrows limits how many data rows are read; only the first 100 are loaded here.
df = pd.read_csv(
    r'C:\Users\nrmmw\Documents\Flatiron\dsc-importing-data-using-pandas\Data\ACS_16_5YR_B24011_with_ann.csv',
    nrows=100,
)
df.head(5)

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HD01_VD01,HD02_VD01,HD01_VD02,HD02_VD02,HD01_VD03,HD02_VD03,HD01_VD04,...,HD01_VD32,HD02_VD32,HD01_VD33,HD02_VD33,HD01_VD34,HD02_VD34,HD01_VD35,HD02_VD35,HD01_VD36,HD02_VD36
0,Id,Id2,Geography,Estimate; Total:,Margin of Error; Total:,"Estimate; Total: - Management, business, scien...","Margin of Error; Total: - Management, business...","Estimate; Total: - Management, business, scien...","Margin of Error; Total: - Management, business...","Estimate; Total: - Management, business, scien...",...,"Estimate; Total: - Natural resources, construc...","Margin of Error; Total: - Natural resources, c...","Estimate; Total: - Production, transportation,...","Margin of Error; Total: - Production, transpor...","Estimate; Total: - Production, transportation,...","Margin of Error; Total: - Production, transpor...","Estimate; Total: - Production, transportation,...","Margin of Error; Total: - Production, transpor...","Estimate; Total: - Production, transportation,...","Margin of Error; Total: - Production, transpor..."
1,0500000US01001,01001,"Autauga County, Alabama",33267,2306,48819,1806,55557,4972,63333,...,31402,5135,35594,3034,36059,3893,47266,13608,19076,4808
2,0500000US01003,01003,"Baldwin County, Alabama",31540,683,49524,1811,57150,6980,63422,...,35603,3882,30549,1606,29604,4554,35504,6260,24182,3580
3,0500000US01005,01005,"Barbour County, Alabama",26575,1653,41652,2638,51797,5980,52775,...,37847,11189,26094,4884,25339,4900,37282,6017,16607,3497
4,0500000US01007,01007,"Bibb County, Alabama",30088,2224,40787,2896,50069,12841,67917,...,45952,5622,28983,3401,31881,2317,26580,2901,23479,4942


In [4]:
# Row 0 holds the human-readable column descriptions rather than data, so drop it.
# errors='ignore' keeps this cell safe to re-run: once label 0 has been removed,
# a plain drop(0) would raise KeyError on the second execution.
df = df.drop(index=0, errors='ignore')
df.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HD01_VD01,HD02_VD01,HD01_VD02,HD02_VD02,HD01_VD03,HD02_VD03,HD01_VD04,...,HD01_VD32,HD02_VD32,HD01_VD33,HD02_VD33,HD01_VD34,HD02_VD34,HD01_VD35,HD02_VD35,HD01_VD36,HD02_VD36
1,0500000US01001,1001,"Autauga County, Alabama",33267,2306,48819,1806,55557,4972,63333,...,31402,5135,35594,3034,36059,3893,47266,13608,19076,4808
2,0500000US01003,1003,"Baldwin County, Alabama",31540,683,49524,1811,57150,6980,63422,...,35603,3882,30549,1606,29604,4554,35504,6260,24182,3580
3,0500000US01005,1005,"Barbour County, Alabama",26575,1653,41652,2638,51797,5980,52775,...,37847,11189,26094,4884,25339,4900,37282,6017,16607,3497
4,0500000US01007,1007,"Bibb County, Alabama",30088,2224,40787,2896,50069,12841,67917,...,45952,5622,28983,3401,31881,2317,26580,2901,23479,4942
5,0500000US01009,1009,"Blount County, Alabama",34900,2063,46593,2963,47003,6189,50991,...,42489,7176,32969,3767,31814,4551,41375,5280,26755,2963


In [5]:
# Knowing the file layout beforehand, the extra first line can be skipped at read time
# instead of being dropped afterwards.
#   skiprows=1 -> skip the first line of the file; the next line becomes the header
#   nrows=100  -> limit the read to 100 data rows
df = pd.read_csv(
    r'C:\Users\nrmmw\Documents\Flatiron\dsc-importing-data-using-pandas\Data\ACS_16_5YR_B24011_with_ann.csv',
    skiprows=1,
    nrows=100,
)
df.head()

Unnamed: 0,Id,Id2,Geography,Estimate; Total:,Margin of Error; Total:,"Estimate; Total: - Management, business, science, and arts occupations:","Margin of Error; Total: - Management, business, science, and arts occupations:","Estimate; Total: - Management, business, science, and arts occupations: - Management, business, and financial occupations:","Margin of Error; Total: - Management, business, science, and arts occupations: - Management, business, and financial occupations:","Estimate; Total: - Management, business, science, and arts occupations: - Management, business, and financial occupations: - Management occupations",...,"Estimate; Total: - Natural resources, construction, and maintenance occupations: - Installation, maintenance, and repair occupations","Margin of Error; Total: - Natural resources, construction, and maintenance occupations: - Installation, maintenance, and repair occupations","Estimate; Total: - Production, transportation, and material moving occupations:","Margin of Error; Total: - Production, transportation, and material moving occupations:","Estimate; Total: - Production, transportation, and material moving occupations: - Production occupations","Margin of Error; Total: - Production, transportation, and material moving occupations: - Production occupations","Estimate; Total: - Production, transportation, and material moving occupations: - Transportation occupations","Margin of Error; Total: - Production, transportation, and material moving occupations: - Transportation occupations","Estimate; Total: - Production, transportation, and material moving occupations: - Material moving occupations","Margin of Error; Total: - Production, transportation, and material moving occupations: - Material moving occupations"
0,0500000US01001,1001,"Autauga County, Alabama",33267,2306,48819,1806,55557,4972,63333,...,31402,5135,35594,3034,36059,3893,47266,13608,19076,4808
1,0500000US01003,1003,"Baldwin County, Alabama",31540,683,49524,1811,57150,6980,63422,...,35603,3882,30549,1606,29604,4554,35504,6260,24182,3580
2,0500000US01005,1005,"Barbour County, Alabama",26575,1653,41652,2638,51797,5980,52775,...,37847,11189,26094,4884,25339,4900,37282,6017,16607,3497
3,0500000US01007,1007,"Bibb County, Alabama",30088,2224,40787,2896,50069,12841,67917,...,45952,5622,28983,3401,31881,2317,26580,2901,23479,4942
4,0500000US01009,1009,"Blount County, Alabama",34900,2063,46593,2963,47003,6189,50991,...,42489,7176,32969,3767,31814,4551,41375,5280,26755,2963


In [6]:
# header gives the (0-indexed) row that contains the column names;
# the data is read starting from the row after it.
# Note: this reads the whole file with the default utf-8 codec and is expected to
# fail with the UnicodeDecodeError shown in the output.
df = pd.read_csv(
    r'C:\Users\nrmmw\Documents\Flatiron\dsc-importing-data-using-pandas\Data\ACS_16_5YR_B24011_with_ann.csv',
    header=1,
)
df.head()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf1 in position 23033: invalid continuation byte

In [7]:
# A UnicodeDecodeError like the one above means the bytes in the file are not valid
# for the codec being used to decode them.
# utf-8 is the default and the most common encoding; when it fails, try latin-1.
df = pd.read_csv(
    r'C:\Users\nrmmw\Documents\Flatiron\dsc-importing-data-using-pandas\Data\ACS_16_5YR_B24011_with_ann.csv',
    header=1,
    encoding='latin-1',
)
df.head()

Unnamed: 0,Id,Id2,Geography,Estimate; Total:,Margin of Error; Total:,"Estimate; Total: - Management, business, science, and arts occupations:","Margin of Error; Total: - Management, business, science, and arts occupations:","Estimate; Total: - Management, business, science, and arts occupations: - Management, business, and financial occupations:","Margin of Error; Total: - Management, business, science, and arts occupations: - Management, business, and financial occupations:","Estimate; Total: - Management, business, science, and arts occupations: - Management, business, and financial occupations: - Management occupations",...,"Estimate; Total: - Natural resources, construction, and maintenance occupations: - Installation, maintenance, and repair occupations","Margin of Error; Total: - Natural resources, construction, and maintenance occupations: - Installation, maintenance, and repair occupations","Estimate; Total: - Production, transportation, and material moving occupations:","Margin of Error; Total: - Production, transportation, and material moving occupations:","Estimate; Total: - Production, transportation, and material moving occupations: - Production occupations","Margin of Error; Total: - Production, transportation, and material moving occupations: - Production occupations","Estimate; Total: - Production, transportation, and material moving occupations: - Transportation occupations","Margin of Error; Total: - Production, transportation, and material moving occupations: - Transportation occupations","Estimate; Total: - Production, transportation, and material moving occupations: - Material moving occupations","Margin of Error; Total: - Production, transportation, and material moving occupations: - Material moving occupations"
0,0500000US01001,1001,"Autauga County, Alabama",33267,2306,48819,1806,55557,4972,63333,...,31402,5135,35594,3034,36059,3893,47266,13608,19076,4808
1,0500000US01003,1003,"Baldwin County, Alabama",31540,683,49524,1811,57150,6980,63422,...,35603,3882,30549,1606,29604,4554,35504,6260,24182,3580
2,0500000US01005,1005,"Barbour County, Alabama",26575,1653,41652,2638,51797,5980,52775,...,37847,11189,26094,4884,25339,4900,37282,6017,16607,3497
3,0500000US01007,1007,"Bibb County, Alabama",30088,2224,40787,2896,50069,12841,67917,...,45952,5622,28983,3401,31881,2317,26580,2901,23479,4942
4,0500000US01009,1009,"Blount County, Alabama",34900,2063,46593,2963,47003,6189,50991,...,42489,7176,32969,3767,31814,4551,41375,5280,26755,2963


In [8]:
# usecols restricts the read to specific columns.
# Passing a list of integers selects columns by position; here the first eight (0-7).
#   encoding='latin-1' -> the encoding that decodes this file
#   header=1           -> column names are on row index 1
df = pd.read_csv(
    r'C:\Users\nrmmw\Documents\Flatiron\dsc-importing-data-using-pandas\Data\ACS_16_5YR_B24011_with_ann.csv',
    encoding='latin-1',
    header=1,
    usecols=list(range(8)),
)
df.head()

Unnamed: 0,Id,Id2,Geography,Estimate; Total:,Margin of Error; Total:,"Estimate; Total: - Management, business, science, and arts occupations:","Margin of Error; Total: - Management, business, science, and arts occupations:","Estimate; Total: - Management, business, science, and arts occupations: - Management, business, and financial occupations:"
0,0500000US01001,1001,"Autauga County, Alabama",33267,2306,48819,1806,55557
1,0500000US01003,1003,"Baldwin County, Alabama",31540,683,49524,1811,57150
2,0500000US01005,1005,"Barbour County, Alabama",26575,1653,41652,2638,51797
3,0500000US01007,1007,"Bibb County, Alabama",30088,2224,40787,2896,50069
4,0500000US01009,1009,"Blount County, Alabama",34900,2063,46593,2963,47003


In [9]:
# usecols also accepts column names instead of positions.
# With header=1 the names must be the ones found on that header row ('Id', 'Id2'),
# not the codes on the first line of the file ('GEO.id', 'GEO.id2').
df = pd.read_csv(
    r'C:\Users\nrmmw\Documents\Flatiron\dsc-importing-data-using-pandas\Data\ACS_16_5YR_B24011_with_ann.csv',
    header=1,
    encoding='latin-1',
    usecols=['Id', 'Id2'],
)
df.head(3)

Unnamed: 0,Id,Id2
0,0500000US01001,1001
1,0500000US01003,1003
2,0500000US01005,1005


In [10]:
# read_excel loads a single sheet of an Excel workbook; with no sheet_name given,
# it reads the first sheet (the following cells pick a sheet by index or by name).
# header=2 -> the column names are on row index 2, i.e. the third row (0-indexed)
df1 = pd.read_excel(
    r'C:\Users\nrmmw\Documents\Flatiron\dsc-importing-data-using-pandas\Data\Yelp_Selected_Businesses.xlsx',
    header=2,
)
df1.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,RESDUcs7fIiihp38-d6_6g,0,2015-09-16,0,gkcPdbblTvZDMSwx8nVEKw,5,Got here early on football Sunday 7:30am as I ...,0,SKteB5rgDlkkUa1Zxe1N0Q
1,RESDUcs7fIiihp38-d6_6g,0,2017-09-09,0,mQfl6ci46mu0xaZrkRUhlA,5,"This buffet is amazing. Yes, it is expensive,...",0,f638AHA_GoHbyDB7VFMz7A
2,RESDUcs7fIiihp38-d6_6g,0,2013-01-14,0,EJ7DJ8bm7-2PLFB9WKx4LQ,3,I was really looking forward to this but it wa...,0,-wVPuTiIEG85LwTK46Prpw
3,RESDUcs7fIiihp38-d6_6g,0,2017-02-08,0,lMarDJDg4-e_0YoJOKJoWA,2,This place....lol our server was nice. But fo...,0,A21zMqdN76ueLZFpmbue0Q
4,RESDUcs7fIiihp38-d6_6g,0,2012-11-19,0,nq_-8lZPUVGomDEP5OOj1Q,1,"After hearing all the buzz about this place, I...",2,Jf1EXieUV7F7s-HGA4EsdA


In [11]:
# Import a specific Excel sheet by its position.
# Sheet positions are 0-indexed, so sheet_name=2 selects the third sheet.
df1 = pd.read_excel(
    r'C:\Users\nrmmw\Documents\Flatiron\dsc-importing-data-using-pandas\Data\Yelp_Selected_Businesses.xlsx',
    sheet_name=2,
    header=2,
)
df1.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,YJ8ljUhLsz6CtT_2ORNFmg,1,2013-04-25,0,xgUz0Ck4_ciNaeIk-H8GBQ,5,I loved this place. Easily the most hipsters p...,1,6cpo8iqgnW3jnozhmY7eAA
1,YJ8ljUhLsz6CtT_2ORNFmg,0,2014-07-07,0,Au7MG4QlAxqq9meyKSQmaw,5,So my boyfriend and I came here for my birthda...,0,8bFE3u1dMoYXkS7ORqlssw
2,YJ8ljUhLsz6CtT_2ORNFmg,0,2015-12-04,0,8IQnZ54nenXjlK-FGZ82Bg,5,I really enjoyed their food. Went there for th...,1,bJmE1ms0MyZ6KHjmfZDWGw
3,YJ8ljUhLsz6CtT_2ORNFmg,2,2016-07-06,1,XY42LMhKoXzwtLoku4mvLA,5,A complete Vegas experience. We arrived right ...,3,PbccpC-I-8rxzF2bCDh8YA
4,YJ8ljUhLsz6CtT_2ORNFmg,0,2014-04-15,0,1xlYVWhyLedoA0HddOJMOw,4,Very great atmosphere had a wonderful bartende...,0,yvlRColhqo_4TzpUFKyroA


In [12]:
# sheet_name also accepts the name of the sheet as a string.
# 'Biz_id_RESDU' is the first sheet of this workbook, so the result matches
# the default read without sheet_name.
df1 = pd.read_excel(
    r'C:\Users\nrmmw\Documents\Flatiron\dsc-importing-data-using-pandas\Data\Yelp_Selected_Businesses.xlsx',
    sheet_name='Biz_id_RESDU',
    header=2,
)
df1.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,RESDUcs7fIiihp38-d6_6g,0,2015-09-16,0,gkcPdbblTvZDMSwx8nVEKw,5,Got here early on football Sunday 7:30am as I ...,0,SKteB5rgDlkkUa1Zxe1N0Q
1,RESDUcs7fIiihp38-d6_6g,0,2017-09-09,0,mQfl6ci46mu0xaZrkRUhlA,5,"This buffet is amazing. Yes, it is expensive,...",0,f638AHA_GoHbyDB7VFMz7A
2,RESDUcs7fIiihp38-d6_6g,0,2013-01-14,0,EJ7DJ8bm7-2PLFB9WKx4LQ,3,I was really looking forward to this but it wa...,0,-wVPuTiIEG85LwTK46Prpw
3,RESDUcs7fIiihp38-d6_6g,0,2017-02-08,0,lMarDJDg4-e_0YoJOKJoWA,2,This place....lol our server was nice. But fo...,0,A21zMqdN76ueLZFpmbue0Q
4,RESDUcs7fIiihp38-d6_6g,0,2012-11-19,0,nq_-8lZPUVGomDEP5OOj1Q,1,"After hearing all the buzz about this place, I...",2,Jf1EXieUV7F7s-HGA4EsdA


In [13]:
# pd.ExcelFile opens the whole workbook once so that several sheets can be read from it.
# Start by listing the names of the sheets it contains.
workbook = pd.ExcelFile(
    r'C:\Users\nrmmw\Documents\Flatiron\dsc-importing-data-using-pandas\Data\Yelp_Selected_Businesses.xlsx'
)
workbook.sheet_names

['Biz_id_RESDU',
 'Biz_id_4JNXU',
 'Biz_id_YJ8lj',
 'Biz_id_ujHia',
 'Biz_id_na4Th']

In [14]:
# Read one sheet from the already-open workbook.
# sheet_name=1 is the second sheet (0-indexed); header=2 puts the column names on row index 2.
df = workbook.parse(header=2, sheet_name=1)
df.head(4)

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,4JNXUYY8wbaaDmk3BPzlWw,0,2012-06-10,0,wl8BO_I-is-JaMwMW5c_gQ,4,I booked a table here for brunch and it did no...,0,fo4mpUqgXL2mJqALc9AvbA
1,4JNXUYY8wbaaDmk3BPzlWw,0,2012-01-20,0,cf9RrqHY9eQ9M53OPyXLtg,4,Came here for lunch after a long night of part...,0,TVvTtXwPXsvrg2KJGoOUTg
2,4JNXUYY8wbaaDmk3BPzlWw,0,2017-05-10,0,BvmhSQ6WFm2Jxu01G8OpdQ,5,Loved the fried goat cheese in tomato sauce al...,0,etbAVunw-4kwr6VTRweZpA
3,4JNXUYY8wbaaDmk3BPzlWw,0,2014-05-03,0,IoKp9n1489XohTV_-EJ0IQ,5,"Love the outdoor atmosphere. Price was right, ...",0,vKXux2Xx3xcicTgYZoR0pg


In [15]:
# Once done, you can save your data back to a csv/excel file
# df.to_csv("NewFileName.csv", index = False)
# index = False: used if we don't want the index included in the new file

## Importing Pandas Lab

In [25]:
# First attempt: read the file with all defaults to see how it is laid out.
# The output shows an "Average Statistics" summary section at the top of the file,
# which leaves most columns named 'Unnamed: N'.
df = pd.read_csv(
    r'C:\Users\nrmmw\Documents\Flatiron\dsc-importing-data-using-pandas-lab\Data\Zipcode_Demos.csv'
)
df

Unnamed: 0,0,Average Statistics,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46
0,1,,0,,,,,,,,...,,,,,,,,,,
1,2,JURISDICTION NAME,10005.8,,,,,,,,...,,,,,,,,,,
2,3,COUNT PARTICIPANTS,9.4,,,,,,,,...,,,,,,,,,,
3,4,COUNT FEMALE,4.8,,,,,,,,...,,,,,,,,,,
4,5,PERCENT FEMALE,0.404,,,,,,,,...,,,,,,,,,,
5,6,COUNT MALE,4.6,,,,,,,,...,,,,,,,,,,
6,7,PERCENT MALE,0.396,,,,,,,,...,,,,,,,,,,
7,8,COUNT GENDER UNKNOWN,0,,,,,,,,...,,,,,,,,,,
8,9,PERCENT GENDER UNKNOWN,0,,,,,,,,...,,,,,,,,,,
9,10,COUNT GENDER TOTAL,9.4,,,,,,,,...,,,,,,,,,,


In [4]:
# The real column names (JURISDICTION NAME, COUNT PARTICIPANTS, ...) sit on row index 47,
# so use that row as the header; everything above it is skipped.
df1 = pd.read_csv(
    r'C:\Users\nrmmw\Documents\Flatiron\dsc-importing-data-using-pandas-lab\Data\Zipcode_Demos.csv',
    header=47,
)
df1

Unnamed: 0,47,JURISDICTION NAME,COUNT PARTICIPANTS,COUNT FEMALE,PERCENT FEMALE,COUNT MALE,PERCENT MALE,COUNT GENDER UNKNOWN,PERCENT GENDER UNKNOWN,COUNT GENDER TOTAL,...,COUNT CITIZEN STATUS TOTAL,PERCENT CITIZEN STATUS TOTAL,COUNT RECEIVES PUBLIC ASSISTANCE,PERCENT RECEIVES PUBLIC ASSISTANCE,COUNT NRECEIVES PUBLIC ASSISTANCE,PERCENT NRECEIVES PUBLIC ASSISTANCE,COUNT PUBLIC ASSISTANCE UNKNOWN,PERCENT PUBLIC ASSISTANCE UNKNOWN,COUNT PUBLIC ASSISTANCE TOTAL,PERCENT PUBLIC ASSISTANCE TOTAL
0,48,10001,44,22,0.5,22,0.5,0,0,44,...,44,100,20,0.45,24,0.55,0,0,44,100
1,49,10002,35,19,0.54,16,0.46,0,0,35,...,35,100,2,0.06,33,0.94,0,0,35,100
2,50,10003,1,1,1.0,0,0.0,0,0,1,...,1,100,0,0.0,1,1.0,0,0,1,100
3,51,10004,0,0,0.0,0,0.0,0,0,0,...,0,0,0,0.0,0,0.0,0,0,0,0
4,52,10005,2,2,1.0,0,0.0,0,0,2,...,2,100,0,0.0,2,1.0,0,0,2,100
5,53,10006,6,2,0.33,4,0.67,0,0,6,...,6,100,0,0.0,6,1.0,0,0,6,100
6,54,10007,1,0,0.0,1,1.0,0,0,1,...,1,100,1,1.0,0,0.0,0,0,1,100
7,55,10009,2,0,0.0,2,1.0,0,0,2,...,2,100,0,0.0,2,1.0,0,0,2,100
8,56,10010,0,0,0.0,0,0.0,0,0,0,...,0,0,0,0.0,0,0.0,0,0,0,0
9,57,10011,3,2,0.67,1,0.33,0,0,3,...,3,100,0,0.0,3,1.0,0,0,3,100


In [29]:
# The column labelled '47' appears to be the file's own row counter (48, 49, 50, ...),
# not data, so remove it.
# Reassigning instead of using inplace=True, together with errors='ignore', keeps this
# cell safe to re-run: the in-place version raises KeyError once '47' is already gone.
# columns= already identifies the axis, so the redundant axis=1 is omitted.
df1 = df1.drop(columns='47', errors='ignore')

In [30]:
# Display df1 again to check the frame after the '47' column was dropped above.
df1

Unnamed: 0,JURISDICTION NAME,COUNT PARTICIPANTS,COUNT FEMALE,PERCENT FEMALE,COUNT MALE,PERCENT MALE,COUNT GENDER UNKNOWN,PERCENT GENDER UNKNOWN,COUNT GENDER TOTAL,PERCENT GENDER TOTAL,...,COUNT CITIZEN STATUS TOTAL,PERCENT CITIZEN STATUS TOTAL,COUNT RECEIVES PUBLIC ASSISTANCE,PERCENT RECEIVES PUBLIC ASSISTANCE,COUNT NRECEIVES PUBLIC ASSISTANCE,PERCENT NRECEIVES PUBLIC ASSISTANCE,COUNT PUBLIC ASSISTANCE UNKNOWN,PERCENT PUBLIC ASSISTANCE UNKNOWN,COUNT PUBLIC ASSISTANCE TOTAL,PERCENT PUBLIC ASSISTANCE TOTAL
0,10001,44,22,0.5,22,0.5,0,0,44,100,...,44,100,20,0.45,24,0.55,0,0,44,100
1,10002,35,19,0.54,16,0.46,0,0,35,100,...,35,100,2,0.06,33,0.94,0,0,35,100
2,10003,1,1,1.0,0,0.0,0,0,1,100,...,1,100,0,0.0,1,1.0,0,0,1,100
3,10004,0,0,0.0,0,0.0,0,0,0,0,...,0,0,0,0.0,0,0.0,0,0,0,0
4,10005,2,2,1.0,0,0.0,0,0,2,100,...,2,100,0,0.0,2,1.0,0,0,2,100
5,10006,6,2,0.33,4,0.67,0,0,6,100,...,6,100,0,0.0,6,1.0,0,0,6,100
6,10007,1,0,0.0,1,1.0,0,0,1,100,...,1,100,1,1.0,0,0.0,0,0,1,100
7,10009,2,0,0.0,2,1.0,0,0,2,100,...,2,100,0,0.0,2,1.0,0,0,2,100
8,10010,0,0,0.0,0,0.0,0,0,0,0,...,0,0,0,0.0,0,0.0,0,0,0,0
9,10011,3,2,0.67,1,0.33,0,0,3,100,...,3,100,0,0.0,3,1.0,0,0,3,100


In [35]:
# df2 = pd.read_csv(r'C:\Users\nrmmw\Documents\Flatiron\dsc-importing-data-using-pandas-lab\Data\Yelp_Reviews_Corrupt.csv',
#                  sep =',')
# df2

ParserError: Error tokenizing data. C error: Expected 10 fields in line 2331, saw 11


In [36]:
# The ParserError above points at line 2331 of the file (counting from 1).
# enumerate() counts from 0, so that line has index 2330. Print it to see why the
# parser found 11 fields instead of 10.
with open(r'C:\Users\nrmmw\Documents\Flatiron\dsc-importing-data-using-pandas-lab\Data\Yelp_Reviews_Corrupt.csv', 
          'r') as f:
    for i, line in enumerate(f):
        if i == 2330:  # line numbers start at 0
            print(line)
            break  # target line found; no need to read the rest of the file

3582,4twpbw7n4DmsLxAm6-sMkg,0,2014-05-11,0,UYkfC4JAkT0BZ324lipDeg,5,One of my go to Korean Food Restaurants downtown! I'm never disappointed when I go there, and the food always hits the spot.,0,dlfHwrLCzleDjqg3ETFhDQ



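I could not get the corrupt file to load with pd.read_csv. The parser fails on line 2331 of the file (index 2330 when counting from 0): the review text on that line contains an unquoted comma ("...when I go there, and the food..."), so the row splits into 11 fields instead of the expected 10. I would probably fix that line manually (for example, by wrapping the text field in double quotes) and try again; other rows may have the same problem.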