# Count how many times each unique value occurs in a DataFrame column

## Using a function

In [156]:
import csv

import numpy as np
import pandas as pd

In [157]:
# Load the Seattle weather CSV into a DataFrame
df = pd.read_csv('/Users/joseservin/DataCamp/Courses/Intro_Matplotlib/seattle_weather.csv')

In [158]:
def col_count(df, col_name):
    """Return a dictionary mapping each distinct entry in a column to its count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to tally.
    col_name : str
        Column label. The label is used exactly as given when such a column
        exists; otherwise its upper-cased form is tried, so ``'Station'``
        still finds a ``'STATION'`` column.

    Returns
    -------
    dict
        ``{entry: number of occurrences}``. Missing float values (NaN) are
        tallied together under a single NaN key.

    Raises
    ------
    ValueError
        If neither ``col_name`` nor its upper-cased form is a column of ``df``.
    """
    upper_name = str.upper(col_name)

    # Prefer the label as typed so lower/mixed-case columns are reachable;
    # fall back to upper case so existing calls keep resolving as before.
    if col_name in df.columns:
        resolved = col_name
    elif upper_name in df.columns:
        resolved = upper_name
    else:
        raise ValueError(upper_name + " is not in the DataFrame")

    count_dict = {}

    # NaN never compares equal to itself, so separate NaN objects would each
    # become their own dictionary key. Routing every float NaN through one
    # shared object lets the dict's identity check group them.
    nan_key = float("nan")

    for entry in df[resolved]:
        if isinstance(entry, float) and entry != entry:
            entry = nan_key
        count_dict[entry] = count_dict.get(entry, 0) + 1
    return count_dict


In [159]:
# The column is labelled 'STATION' in the data; col_count takes care of the case difference
return_var = col_count(df, 'Station')

In [160]:
return_var

{'USC00456295': 12,
 'USW00024222': 12,
 'USW00024233': 12,
 'USC00458278': 12,
 'USW00094274': 12,
 'USC00459021': 12,
 'USW00094290': 12,
 'USW00024234': 12,
 'USC00458508': 12,
 'USC00454486': 12,
 'USW00094248': 12,
 'USC00451233': 12,
 'USC00452675': 12,
 'USC00455525': 12,
 'USC00457773': 12,
 'USC00450872': 12,
 'USC00454169': 12}

In [161]:
# Add up every per-value count from the tally
total = sum(return_var.values())
print(total)

204


In [162]:
# Turn the {value: count} mapping into a two-column table, one row per key
return_var_df = pd.DataFrame(
    list(return_var.items()),
    columns=['Location', 'Count'],
)

In [163]:
return_var_df

Unnamed: 0,Location,Count
0,USC00456295,12
1,USW00024222,12
2,USW00024233,12
3,USC00458278,12
4,USW00094274,12
5,USC00459021,12
6,USW00094290,12
7,USW00024234,12
8,USC00458508,12
9,USC00454486,12


## Using Pandas

In [164]:
# Load the Seattle weather CSV again for the pandas-only version
df = pd.read_csv('/Users/joseservin/DataCamp/Courses/Intro_Matplotlib/seattle_weather.csv')
# Built-in tally of how many times each STATION value occurs
df['STATION'].value_counts()

USC00456295    12
USC00454486    12
USC00450872    12
USC00457773    12
USC00455525    12
USC00452675    12
USC00451233    12
USW00094248    12
USC00458508    12
USW00024222    12
USW00024234    12
USW00094290    12
USC00459021    12
USW00094274    12
USC00458278    12
USW00024233    12
USC00454169    12
Name: STATION, dtype: int64

# Using `filter` to select entries by their leading characters

In [165]:
# Load the tweets CSV into a DataFrame (this rebinds `df`)
df = pd.read_csv('/Users/joseservin/DataCamp/Courses/Python_Toolbox/tweets.csv')

In [166]:
def _starts_with_rt(text):
    """Return True when the first two characters of ``text`` are 'RT'."""
    return text[0:2] == 'RT'


# filter() is lazy: no tweet is examined until `results` is iterated
results = filter(_starts_with_rt, df['text'])

In [167]:
type(results)

filter

In [168]:
# Materialise the lazy filter object first, then print each matching tweet
matching_tweets = list(results)
for tweet in matching_tweets:
    print(tweet)

RT @bpolitics: .@krollbondrating's Christopher Whalen says Clinton is the weakest Dem candidate in 50 years https://t.co/pLk7rvoRSn https:/…
RT @HeidiAlpine: @dmartosko Cruz video found.....racing from the scene.... #cruzsexscandal https://t.co/zuAPZfQDk3
RT @AlanLohner: The anti-American D.C. elites despise Trump for his America-first foreign policy. Trump threatens their gravy train. https:…
RT @BIackPplTweets: Young Donald trump meets his neighbor  https://t.co/RFlu17Z1eE
RT @trumpresearch: @WaitingInBagdad @thehill Trump supporters have selective amnisia.
RT @HouseCracka: 29,000+ PEOPLE WATCHING TRUMP LIVE ON ONE STREAM!!!

https://t.co/7QCFz9ehNe
RT @urfavandtrump: RT for Brendon Urie
Fav for Donald Trump https://t.co/PZ5vS94lOg
RT @trapgrampa: This is how I see #Trump every time he speaks. https://t.co/fYSiHNS0nT
RT @trumpresearch: @WaitingInBagdad @thehill Trump supporters have selective amnisia.
RT @Pjw20161951: NO KIDDING: #SleazyDonald just attacked Scott Walker for NOT RAISI

# Loading Large Amounts of Data

In [169]:
# Load the whole climate-change CSV at once, as a baseline for the chunked read below
df = pd.read_csv('/Users/joseservin/DataCamp/Courses/Intro_Matplotlib/climate_change.csv')

In [170]:
df.dtypes

date              object
co2              float64
relative_temp    float64
dtype: object

In [171]:
df.co2.sum()

246269.22000000003

## Using Chunksize

In [172]:
# Read the CSV 50 rows at a time and keep one partial co2 sum per chunk
total_co2 = [
    chunk['co2'].sum()
    for chunk in pd.read_csv(
        '/Users/joseservin/DataCamp/Courses/Intro_Matplotlib/climate_change.csv',
        chunksize=50,
    )
]


In [173]:
np.sum(total_co2)

246269.2200000001

# Processing Data in Chunks 

In [174]:
# Tally the first-column value of the first few data rows, reading the file
# one line at a time instead of loading it whole.
n_rows = 3  # how many data rows to look at

# Open a connection to the file
with open('/Users/joseservin/DataCamp/Courses/Intro_Matplotlib/austin_weather.csv') as file:

    # Skip the column names
    file.readline()

    # Initialize an empty dictionary: counts_dict
    counts_dict = {}

    # Process at most the first n_rows data rows
    for j in range(n_rows):

        raw_line = file.readline()

        # readline() returns '' once the file is exhausted; stop there rather
        # than tallying an empty string as if it were a station
        if not raw_line:
            break

        # Split the current line into a list: line
        # NOTE: a plain split leaves the CSV's double quotes on each field,
        # so the keys below keep their surrounding quote characters
        line = raw_line.split(',')

        # Get the value for the first column: station
        station = line[0]

        # Add one to the station's count, starting from 0 the first time it is seen
        counts_dict[station] = counts_dict.get(station, 0) + 1

# Print the resulting dictionary
print(counts_dict)

{'"USW00013904"': 3}


In [175]:
# Tally the first-column value of the first few data rows, reading the file
# one line at a time instead of loading it whole.
n_rows = 50  # how many data rows to look at

# Open a connection to the file
with open('/Users/joseservin/DataCamp/Courses/Intro_Matplotlib/seattle_weather.csv') as file:

    # Skip the column names
    file.readline()

    # Initialize an empty dictionary: counts_dict
    counts_dict = {}

    # Process at most the first n_rows data rows
    for j in range(n_rows):

        raw_line = file.readline()

        # readline() returns '' once the file is exhausted; stop there rather
        # than tallying an empty string as if it were a station
        if not raw_line:
            break

        # Split the current line into a list: line
        # NOTE: a plain split does not strip any quoting around a field, so
        # the keys are whatever raw text precedes the first comma
        line = raw_line.split(',')

        # Get the value for the first column: station
        station = line[0]

        # Add one to the station's count, starting from 0 the first time it is seen
        counts_dict[station] = counts_dict.get(station, 0) + 1

# Print the resulting dictionary
print(counts_dict)

{'"USC00456295"': 12, '"USW00024222"': 12, '"USW00024233"': 12, '"USC00458278"': 12, '"USW00094274"': 2}


In [176]:
# Load the Austin weather CSV with pandas to cross-check the manual line-by-line results
df = pd.read_csv('/Users/joseservin/DataCamp/Courses/Intro_Matplotlib/austin_weather.csv')

In [177]:
df.dtypes

STATION                     object
NAME                        object
DATE                         int64
MLY-CLDD-BASE45              int64
MLY-CLDD-BASE50              int64
                            ...   
MLY-TMIN-PRBOCC-LSTH024      int64
MLY-TMIN-PRBOCC-LSTH028      int64
MLY-TMIN-PRBOCC-LSTH032      int64
MLY-TMIN-PRBOCC-LSTH036      int64
MLY-TMIN-STDDEV            float64
Length: 67, dtype: object

In [178]:
df['MLY-CLDD-BASE45'].sum()

8290

In [179]:
# Collect the MLY-CLDD-BASE45 value from the first few data rows, streaming
# the file row by row instead of loading it whole.
n_rows = 12  # how many data rows to look at
target_col = 'MLY-CLDD-BASE45'

# Open a connection to the file (newline='' is what the csv module asks for)
with open('/Users/joseservin/DataCamp/Courses/Intro_Matplotlib/austin_weather.csv', newline='') as file:

    # csv.reader understands quoted fields, so the comma inside the NAME
    # value does not shift the later columns the way str.split(',') does
    reader = csv.reader(file)

    # Read the column names and locate the target column by name rather
    # than by a hard-coded position
    header = next(reader)
    col_idx = header.index(target_col)

    # Initialize an empty list: cloud_base45_total
    cloud_base45_total = []

    # Process at most the first n_rows data rows
    for j in range(n_rows):

        line = next(reader, None)

        # Stop early if the file has fewer data rows than requested
        if line is None:
            break

        # Get the value for the target column (quotes already removed by csv)
        cloud_base45 = line[col_idx]

        # int(float(...)) also accepts decimal-formatted text such as '190.0'
        cloud_base45_new = int(float(cloud_base45))
        cloud_base45_total.append(cloud_base45_new)

# Show the collected values
cloud_base45_total

[190, 228, 446, 668, 936, 1081, 1192, 1199, 985, 745, 412, 208]

In [180]:
sum(cloud_base45_total)

8290

# Processing data line by line 

In [183]:
# Peek at the raw text of the first two data rows
with open('/Users/joseservin/DataCamp/Courses/Intro_Matplotlib/austin_weather.csv') as file:

    # The first line holds the column names; read it and throw it away
    file.readline()

    # Each line read keeps its own trailing newline and print() appends
    # another, which is why a blank line follows every row in the output
    rows_shown = 0
    while rows_shown < 2:
        print(file.readline())
        rows_shown += 1


"USW00013904","AUSTIN BERGSTROM INTERNATIONAL AIRPORT, TX US","01","190","103","50","35","18","1","-7777","5","25.8","3.9","308","190","103","50","35","18","5","1","-7777","244","214","22","59","128","229","277","353","495","0.83","1.48","2.76","7.2","3.9","1.2","0.8","2.20","49.2","3.0","299","25.8","185","90","8","0.0","0.0","0.1","62.1","4.1","0.0","1","15","11.9","203","275","302","310","36.3","298","570","839","967","997","1000","2.9"

"USW00013904","AUSTIN BERGSTROM INTERNATIONAL AIRPORT, TX US","02","228","132","68","49","29","3","1","11","25.3","3.8","349","228","132","68","49","29","11","3","1","262","231","10","29","73","148","186","250","373","1.00","2.00","3.23","6.9","3.2","1.2","0.3","2.31","52.1","3.4","273","25.3","195","99","17","0.1","0.0","0.1","64.8","4.6","0.0","0","4","7.9","152","239","264","280","39.4","103","327","614","867","973","999","3.2"

