In [1]:
from pathlib import Path

import pandas as pd


In [5]:
# A DataFrame is the workhorse datatype of the pandas library: a two-dimensional, iterable table that holds the data
# read from a CSV, Excel, or other supported file so that it can be inspected, cleaned, rearranged, or analysed. The
# most common way to build one is the pd.read_xxx() family of functions, which take the path to a file and parse it
# into a DataFrame. Three CSV files are loaded below: world food production figures, the public results of a survey,
# and the schema that describes that survey's questions.

# Folder holding the survey CSVs. It is an absolute path on the author's machine, kept in one place so that it only
# has to be edited once when the files live somewhere else.
SURVEY_DATA_DIR = Path(r"C:\Users\alberto\Desktop\Coding\Python\survey-data")

foodproduction = pd.read_csv("world-food-production.csv")
survey = pd.read_csv(SURVEY_DATA_DIR / "survey_results_public.csv")
schema = pd.read_csv(SURVEY_DATA_DIR / "survey_results_schema.csv")

# Jupyter only renders the value of the LAST expression in a cell, so the first three expressions below are evaluated
# but not shown; move one to the end of the cell (or into its own cell) to see its result.
foodproduction      # the whole food production frame
survey.head()       # the first 5 rows of the survey
schema.shape[1]     # the number of columns in the schema
survey.shape        # (rows, columns) of the survey -- this is the value the cell displays

(89184, 84)

In [8]:
# pd.set_option() customises how pandas behaves; here it controls how much of a DataFrame is printed. .shape returns
# (rows, columns), so survey.shape[1] is the survey's column count (84) and schema.shape[0] is the schema's row count.
# Raising display.max_columns and display.max_rows to those values lets every survey column and every schema row be
# printed without truncation. These options never change the data itself: whenever a frame is larger than the limits,
# the rows/columns that do not fit are simply replaced with an ellipsis in the printed output.
survey_column_count = survey.shape[1]
schema_row_count = schema.shape[0]
pd.set_option("display.max_columns", survey_column_count)
pd.set_option("display.max_rows", schema_row_count)
schema

Unnamed: 0,qid,qname,question,force_resp,type,selector
0,QID16,S0,"<div><span style=""font-size:19px;""><strong>Hel...",False,DB,TB
1,QID12,MetaInfo,Browser Meta Info,False,Meta,Browser
2,QID310,Q310,"<div><span style=""font-size:19px;""><strong>You...",False,DB,TB
3,QID312,Q120,,True,MC,SAVR
4,QID1,S1,"<span style=""font-size:22px; font-family: aria...",False,DB,TB
5,QID2,MainBranch,Which of the following options best describes ...,True,MC,SAVR
6,QID127,Age,What is your age? *,True,MC,MAVR
7,QID296,Employment,Which of the following best describes your cur...,False,MC,MAVR
8,QID308,RemoteWork,Which best describes your current work situation?,False,MC,SAVR
9,QID297,CodingActivities,Which of the following best describes the code...,False,MC,MAVR


In [9]:
# pandas ships with many attributes and methods for inspecting the data stored in a DataFrame; these are some of the
# most common ones.

# .shape is an attribute (no parentheses) holding the frame's dimensions as a (rows, columns) tuple. It is not the
# last expression in this cell, so Jupyter does not render its value here.
foodproduction.shape

# .info() prints the index range plus each column's name, non-null count and dtype; here it reports 24 columns.
foodproduction.info()

# .head(n) returns the first n rows (5 when n is omitted); .tail(n) does the same for the last rows.
foodproduction.head(n=10)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11912 entries, 0 to 11911
Data columns (total 24 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Entity                               11912 non-null  object 
 1   Year                                 11912 non-null  int64  
 2   Maize Production (tonnes)            11912 non-null  float64
 3   Rice  Production ( tonnes)           11912 non-null  float64
 4   Yams  Production (tonnes)            11912 non-null  float64
 5   Wheat Production (tonnes)            11912 non-null  float64
 6   Tomatoes Production (tonnes)         11912 non-null  float64
 7   Tea  Production ( tonnes )           11912 non-null  float64
 8   Sweet potatoes  Production (tonnes)  11912 non-null  float64
 9   Sunflower seed  Production (tonnes)  11912 non-null  float64
 10  Sugar cane Production (tonnes)       11912 non-null  float64
 11  Soybeans  Production (tonnes

Unnamed: 0,Entity,Year,Maize Production (tonnes),Rice Production ( tonnes),Yams Production (tonnes),Wheat Production (tonnes),Tomatoes Production (tonnes),Tea Production ( tonnes ),Sweet potatoes Production (tonnes),Sunflower seed Production (tonnes),Sugar cane Production (tonnes),Soybeans Production (tonnes),Rye Production (tonnes),Potatoes Production (tonnes),Oranges Production (tonnes),"Peas, dry Production ( tonnes)",Palm oil Production (tonnes),Grapes Production (tonnes),"Coffee, green Production ( tonnes)",Cocoa beans Production (tonnes),"Meat, chicken Production (tonnes)",Bananas Production ( tonnes),Avocados Production (tonnes),Apples Production (tonnes)
0,Afghanistan,1961,700000.0,319000.0,7467702.0,2279000.0,1873812.0,56315.0,3270871.0,12000.0,45000.0,71813.0,10290.0,130000.0,10100.0,232910.0,1131882.0,225000.0,870970.0,835368.0,5600.0,3139079.0,63439.0,15100.0
1,Afghanistan,1962,700000.0,319000.0,7420515.0,2279000.0,2044797.0,61519.0,3562524.0,12800.0,45000.0,84594.0,9100.0,115000.0,10100.0,259412.0,1111006.0,225000.0,883512.0,867170.0,6000.0,3181580.0,65118.0,15100.0
2,Afghanistan,1963,713000.0,319000.0,8479074.0,1947000.0,2096077.0,63596.0,3409916.0,12800.0,45000.0,87260.0,13800.0,122000.0,10100.0,251529.0,1145004.0,225000.0,996674.0,922621.0,6160.0,3304256.0,61760.0,15100.0
3,Afghanistan,1964,720000.0,380000.0,9113779.0,2230000.0,2388264.0,66604.0,3229336.0,12800.0,45000.0,76781.0,16100.0,129000.0,12400.0,247556.0,1160831.0,265000.0,1162048.0,1190061.0,6400.0,3392527.0,62759.0,18400.0
4,Afghanistan,1965,720000.0,380000.0,10067913.0,2282000.0,2559608.0,72418.0,3169104.0,13200.0,51000.0,73067.0,13900.0,132000.0,13700.0,266947.0,1138860.0,287000.0,1075084.0,874245.0,6800.0,3450849.0,66269.0,20400.0
5,Afghanistan,1966,720000.0,337000.0,10863614.0,2033000.0,2690984.0,90272.0,3214807.0,14000.0,51000.0,77180.0,8411.0,136000.0,15300.0,280862.0,1151649.0,315000.0,1199529.0,969648.0,7200.0,3563461.0,68331.0,22800.0
6,Afghanistan,1967,768000.0,396000.0,12123091.0,2280000.0,2580187.0,86111.0,3556573.0,14000.0,57000.0,80884.0,14301.0,147000.0,18500.0,275066.0,1100007.0,372000.0,1085561.0,982906.0,7600.0,3603535.0,71416.0,27600.0
7,Afghanistan,1968,773000.0,402000.0,12840044.0,2354000.0,2799236.0,100736.0,3601209.0,14400.0,57000.0,90459.0,9100.0,150000.0,18700.0,293310.0,1154021.0,375000.0,1183868.0,855617.0,8000.0,3732902.0,73906.0,27900.0
8,Afghanistan,1969,785000.0,407000.0,14496418.0,2454000.0,3001560.0,114363.0,3760912.0,14800.0,60000.0,89000.0,8100.0,154000.0,18900.0,274316.0,1195096.0,379000.0,1253504.0,1004663.0,9600.0,3914729.0,76704.0,28200.0
9,Afghanistan,1970,667000.0,366000.0,16413323.0,2081000.0,3111482.0,121398.0,4558892.0,14000.0,55000.0,97211.0,15160.0,144000.0,18300.0,284651.0,1077079.0,364000.0,1295001.0,1120835.0,9600.0,4076156.0,78464.0,27300.0


In [20]:
# Display limits can also be plain numbers. Assigning to pd.options.display.<name> is the attribute-style equivalent
# of calling pd.set_option("display.<name>", value). With both limits set to 5, printing the survey shows only a
# handful of its rows and columns and replaces everything else with an ellipsis.
preview_limit = 5
pd.options.display.max_columns = preview_limit
pd.options.display.max_rows = preview_limit
survey

Unnamed: 0,ResponseId,Q120,...,SurveyEase,ConvertedCompYearly
0,1,I agree,...,,
1,2,I agree,...,Easy,285000.0
...,...,...,...,...,...
89182,89183,I agree,...,Neither easy nor difficult,
89183,89184,I agree,...,Easy,
