In [1]:
!pip install streamlit --quiet

[K     |████████████████████████████████| 10.1 MB 4.2 MB/s 
[K     |████████████████████████████████| 164 kB 49.9 MB/s 
[K     |████████████████████████████████| 111 kB 47.1 MB/s 
[K     |████████████████████████████████| 77 kB 5.1 MB/s 
[K     |████████████████████████████████| 181 kB 50.1 MB/s 
[K     |████████████████████████████████| 4.3 MB 37.4 MB/s 
[K     |████████████████████████████████| 63 kB 1.3 MB/s 
[K     |████████████████████████████████| 131 kB 48.8 MB/s 
[K     |████████████████████████████████| 428 kB 49.2 MB/s 
[K     |████████████████████████████████| 130 kB 48.1 MB/s 
[K     |████████████████████████████████| 793 kB 48.0 MB/s 
[K     |████████████████████████████████| 381 kB 50.6 MB/s 
[?25h  Building wheel for blinker (setup.py) ... [?25l[?25hdone
  Building wheel for validators (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source 

In [2]:
!pip install newspaper3k --quiet

[K     |████████████████████████████████| 211 kB 4.0 MB/s 
[K     |████████████████████████████████| 7.4 MB 39.3 MB/s 
[K     |████████████████████████████████| 81 kB 8.7 MB/s 
[K     |████████████████████████████████| 93 kB 1.7 MB/s 
[?25h  Building wheel for tinysegmenter (setup.py) ... [?25l[?25hdone
  Building wheel for feedfinder2 (setup.py) ... [?25l[?25hdone
  Building wheel for jieba3k (setup.py) ... [?25l[?25hdone
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone


In [11]:
%%writefile myapp.py


import streamlit as st
import nltk
nltk.download('punkt')
from newspaper import Article

# Page heading rendered at the top of the Streamlit app.
st.title("INSTAGIVE!")
st.header("A Charity Recommendation Engine")


import pandas as pd                     # for data manipulation and analysis

#%matplotlib inline
import matplotlib.pyplot as plt        # object-oriented API for embedding plots into applications

import matplotlib                       
import numpy as np                     # used for working with arrays
import missingno as msno               # provides a series of visualisations to understand the presence and distribution of missing data within a pandas dataframe
import altair as alt                   # statistical visualization library
from vega_datasets import data         


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


from sklearn import preprocessing

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

import re
from sklearn.base import BaseEstimator, TransformerMixin
import nltk.stem


def recommend_charities(metrics, cause, charity_data, n=3):
    """
    Recommend the top-rated charities of one cause, once per rating metric.

    Parameters
    ----------
    metrics : iterable of str
        Column names of ``charity_data`` to rank by (for example the overall
        rating, the financial rating, and the accountability and transparency
        rating).
    cause : str
        Value of the ``category`` column that the ranking is restricted to.
    charity_data : pandas.DataFrame
        Charity table; must contain a ``category`` column and every column
        named in ``metrics``.
    n : int, optional
        Number of charities to keep per metric (default 3).

    Returns
    -------
    dict
        Maps each metric to a DataFrame copy holding the ``n`` rows of the
        given cause with the highest value of that metric, sorted in
        descending order. Empty when ``metrics`` is empty.

    Raises
    ------
    KeyError
        If ``cause`` does not occur in the ``category`` column, or if a metric
        is not a column of ``charity_data``.
    """
    metrics = list(metrics)
    if not metrics:
        # Nothing to rank by: return an empty mapping without looking the
        # cause up at all.
        return {}

    # Group by the scalar column name rather than a one-element list: recent
    # pandas versions expect a tuple key in get_group() for list groupers.
    # The cause subset is identical for every metric, so it is fetched once.
    cause_rows = charity_data.groupby('category').get_group(cause)

    return {
        metric: cause_rows.sort_values(by=[metric], ascending=False).head(n).copy()
        for metric in metrics
    }


# Rated charities that the recommendations are drawn from.
charity_data = pd.read_csv('/content/complete_data.csv') # update path link here

# Labelled charity descriptions used to train the cause classifier.
df = pd.read_csv('/content/charity_navigator.csv.txt')
df.drop(['Unnamed: 0','charityid'], axis=1, inplace=True)     # columns not used below

## Drop Empty Rows
df.dropna(axis=0, how='any', inplace=True)

# Per-category row counts; not read again further down in this script.
pic = pd.DataFrame(df['category'].value_counts())
pic['name'] = pic.index.values.tolist()

# Every category is resampled to exactly this many rows.
# NOTE(review): the origin of 671 is not visible here -- confirm.
threshold = 671

lst = []

for class_index, group in df.groupby('category'):
    shortfall = threshold - len(group)

    if shortfall > 0: # oversample
        # Keep every original row and top up with repeated draws.
        lst.append(group)
        lst.append(group.sample(shortfall, replace=True, random_state=1))

    elif shortfall < 0: # under-sample
        # Draw without replacement so the reduced class holds `threshold`
        # distinct rows instead of duplicates of a smaller subset.
        lst.append(group.sample(threshold, replace=False, random_state=1))

    else:
        lst.append(group)

df_balanced = pd.concat(lst)

target_lst = ['animals', 'arts culture humanities' ,'community development',
              'education','environment']

# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice of the concatenated frame.
df_balanced = df_balanced.loc[df_balanced['category'].isin(target_lst)].copy()

target = 'category'

# Fit the encoder on the categories that are actually kept, so the integer
# codes cover exactly the classes the models are trained on.
le = preprocessing.LabelEncoder()
le.fit(df_balanced[target])
#print(le.classes_)
df_balanced['target'] = le.transform(df_balanced[target])

# merge text to create a document; the spaces keep the last word of one field
# from fusing with the first word of the next
df_balanced['corpus'] = df_balanced.mission + ' ' + df_balanced.tagline + ' ' + df_balanced.cause
# drop other columns, convenience
df_balanced.drop(['mission', 'tagline', 'cause'], axis=1, inplace=True)

# Integer code -> category name, taken from the fitted encoder so the lookup
# cannot drift from the encoding. With all five target categories present this
# yields 0: animals, 1: arts culture humanities, 2: community development,
# 3: education, 4: environment.
target_to_category = {code: str(name) for code, name in enumerate(le.classes_)}

# preprocessing step
def drop_integers(s):
    """Return ``s`` with every run of digit characters removed."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', s)

# stemmer
english_stemmer = nltk.stem.SnowballStemmer('english')       # Snowball stemmer for English, shared by the helpers below

def stemmer(doc):
    """
    Tokenize ``doc`` and return the list of stemmed tokens.

    Tokens come from the default ``CountVectorizer`` analyzer and each one is
    stemmed with the module-level ``english_stemmer``.

    Parameters
    ----------
    doc : str
        Raw text of one document.

    Returns
    -------
    list of str
        Stemmed tokens, in the order the analyzer produced them.
    """
    analyzer = CountVectorizer().build_analyzer()
    return [english_stemmer.stem(w) for w in analyzer(doc)]

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token with ``english_stemmer``."""

    def build_analyzer(self):
        """Return the inherited analyzer wrapped so each token it yields is stemmed."""
        base_analyzer = super().build_analyzer()

        # will need to rewrite if pickled - a locally defined function
        # cannot be pickled
        def stemmed_analyzer(doc):
            return [english_stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer

# Stemmed bag-of-words over the whole balanced corpus: digits are stripped
# before tokenizing, English stop words are removed, and terms that occur in
# fewer than 5 documents are dropped.
# NOTE(review): vectorizer_s, X and X_tfidf are not read again further down in
# this script (the pipelines build their own vectorizers) -- confirm whether
# this step is still needed.
vectorizer_s = StemmedCountVectorizer(
    analyzer='word',
    stop_words='english',
    preprocessor=drop_integers,
    min_df=5,
)

X = vectorizer_s.fit_transform(df_balanced.corpus)

# Re-weight the raw counts with tf-idf.
vectorizer = TfidfTransformer()
X_tfidf = vectorizer.fit_transform(X)    #add vis. add to results, print words

# Seed shared by the train/test split and the randomized classifiers, so
# repeated runs of the script train the same models.
seed = 2
X_train, X_test, y_train, y_test = train_test_split(df_balanced.corpus, df_balanced.target,
                                                    test_size=0.20,
                                                    random_state=seed,
                                                    shuffle=True)


def _make_text_pipeline(classifier):
    """Return a Pipeline: stemmed counts -> tf-idf -> ``classifier``."""
    return Pipeline([
        ('vect', StemmedCountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier),
    ])


NB_pipeline = _make_text_pipeline(MultinomialNB(alpha=0.01))

# lbfgs already fits a multinomial model for multi-class targets, so the
# `multi_class` argument (deprecated in scikit-learn 1.5) is not passed.
LR_pipeline = _make_text_pipeline(LogisticRegression(random_state=0, solver='lbfgs'))

SVC_pipeline = _make_text_pipeline(LinearSVC(random_state=seed))

# NOTE(review): this pipeline is the one used for the live prediction further
# down, while the accuracy table built below is never displayed -- confirm the
# choice of model.
RF_pipeline = _make_text_pipeline(RandomForestClassifier(n_estimators=100,
                                                         max_depth=2,
                                                         random_state=seed))

SGD_pipeline = _make_text_pipeline(SGDClassifier(max_iter=1000, tol=1e-3,
                                                 random_state=seed))

## naive bayes
NB_pipeline.fit(X_train, y_train)
predictionNB = NB_pipeline.predict(X_test)

## Logistic Regression
LR_pipeline.fit(X_train, y_train)
predictionLR = LR_pipeline.predict(X_test)

## Support Vector
SVC_pipeline.fit(X_train, y_train)
predictionSVC = SVC_pipeline.predict(X_test)

## Stochastic Gradient Descent
SGD_pipeline.fit(X_train, y_train)
predictionSGD = SGD_pipeline.predict(X_test)

# random forest
RF_pipeline.fit(X_train, y_train)
predictionRF = RF_pipeline.predict(X_test)

# Held-out accuracy of every model, one row per algorithm.
results = {'Algorithm': ['naive_bayes', 'logistic_regression','support_vector','gradient_descent','random_forest' ],
           'Accuracy': [accuracy_score(y_test, prediction)
                        for prediction in (predictionNB, predictionLR, predictionSVC,
                                           predictionSGD, predictionRF)]}
res_df = pd.DataFrame(results)


from newspaper.article import ArticleException

# --- Read the article URL from the user ------------------------------------
link=st.text_input("Enter the Web-Article link:","")
st.markdown(f"Article link is: {link}")
st.markdown(f"Type of input link {type(link)}")

if link.strip() == '':
    # The text box is empty on first page load; there is nothing to fetch, so
    # end this script run here instead of failing inside the downloader.
    st.markdown("Empty link.")
    st.stop()

# --- Fetch and parse the article -------------------------------------------
try:
    article = Article(link)
    article.download()
    article.parse()
except ArticleException as exc:
    # Raised for malformed URLs and for downloads that did not succeed.
    st.error(f"Could not fetch the article: {exc}")
    st.stop()

# NOTE(review): the keywords/summary produced by nlp() are not read below --
# confirm whether this call is still required.
article.nlp()

article_text=article.text

if (article_text == ''):
    # No body text was extracted; classifying an empty string is meaningless.
    st.markdown("Empty text.")
    st.stop()

# --- Predict the cause ------------------------------------------------------
# predict() receives a single document, so it returns a single label.
results = RF_pipeline.predict([article_text])
ans = target_to_category[results[0]]
st.markdown(f""" ### The predicted cause is: {ans}""")

# --- Recommend charities for the predicted cause ---------------------------
metrics = ['overall_rating', 'financial_rating', 'accountability_and_transperancy_rating']

recommended_charities = recommend_charities(metrics, ans, charity_data)

for metric in metrics:
    top_rows = recommended_charities[metric]
    donation_link_list = top_rows['donation_link'].tolist()
    webadress_list = top_rows['web_address'].tolist()
    name_list = top_rows['charity_name'].tolist()
    output = "Depending on "+str(metric)+" :"
    st.markdown(output)

    # Iterate over the rows that exist rather than a fixed count, so a cause
    # with fewer charities than requested does not raise IndexError.
    for name, web_address, donation_link in zip(name_list, webadress_list, donation_link_list):
        st.write("[{name}]({web_address}). Click [here]({donation_link}) to donate.".format(name=name, web_address=web_address, donation_link=donation_link))

st.markdown(f"Article text: {article_text}")

#print("The predicted cause is: {cause}")

Overwriting myapp.py


In [13]:
 !streamlit run myapp.py & npx localtunnel --port 8501

2022-06-01 17:40:21.215 INFO    numexpr.utils: NumExpr defaulting to 2 threads.
[K[?25hnpx: installed 22 in 3.008s
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.2:8501[0m
[34m  External URL: [0m[1mhttp://35.229.150.75:8501[0m
[0m
your url is: https://all-emus-travel-35-229-150-75.loca.lt
[34m  Stopping...[0m
^C
