In [1]:
pip install requests pandas



In [5]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
import requests
import pandas as pd
from collections import Counter
import re
import os
import time
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

class HackathonProjectAnalyzer:
    def __init__(self, token=None):
        """
        Initialize the analyzer with an optional GitHub token.

        Args:
            token (str, optional): GitHub API token for higher rate limits
        """
        self.base_url = "https://api.github.com"
        self.headers = {}
        if token:
            self.headers["Authorization"] = f"token {token}"

        # Download NLTK resources explicitly
        print("Downloading required NLTK resources...")
        nltk.download('punkt')
        nltk.download('stopwords')

        # Import these after downloading
        from nltk.corpus import stopwords
        from nltk.tokenize import word_tokenize
        self.word_tokenize = word_tokenize
        self.stop_words = set(stopwords.words('english'))

    def search_hackathon_projects(self, limit=500, min_stars=10):
        """
        Search for hackathon projects on GitHub.

        Args:
            limit (int): Maximum number of repositories to analyze
            min_stars (int): Minimum stars to consider

        Returns:
            list: List of project data dictionaries
        """
        query_terms = [
            "hackathon project",
            "hackathon submission",
            "hackathon winner",
            "24 hour hackathon",
            "hackathon demo",
            "hackday project"
        ]

        all_projects = []

        for query in query_terms:
            params = {
                "q": f"{query} stars:>={min_stars}",
                "sort": "stars",
                "order": "desc",
                "per_page": 100
            }

            page = 1
            while len(all_projects) < limit:
                params["page"] = page

                try:
                    response = requests.get(
                        f"{self.base_url}/search/repositories",
                        headers=self.headers,
                        params=params
                    )

                    if response.status_code == 200:
                        data = response.json()
                        items = data.get("items", [])

                        if not items:
                            break

                        for repo in items:
                            # Extract project info
                            project = {
                                "name": repo["name"],
                                "full_name": repo["full_name"],
                                "description": repo["description"] or "",
                                "url": repo["html_url"],
                                "stars": repo["stargazers_count"],
                                "language": repo["language"],
                                "topics": repo.get("topics", []),
                                "created_at": repo["created_at"]
                            }

                            # Avoid duplicates
                            if project["full_name"] not in [p["full_name"] for p in all_projects]:
                                all_projects.append(project)

                                if len(all_projects) >= limit:
                                    break

                        page += 1

                        # Handle rate limiting
                        remaining = int(response.headers.get("X-RateLimit-Remaining", 0))
                        if remaining < 5:
                            reset_time = int(response.headers.get("X-RateLimit-Reset", 0))
                            sleep_time = max(0, reset_time - time.time()) + 1
                            print(f"Rate limit approaching. Sleeping for {sleep_time:.2f} seconds")
                            time.sleep(sleep_time)

                    elif response.status_code == 403:
                        # Rate limited
                        reset_time = int(response.headers.get("X-RateLimit-Reset", 0))
                        sleep_time = max(0, reset_time - time.time()) + 1
                        print(f"Rate limited. Sleeping for {sleep_time:.2f} seconds")
                        time.sleep(sleep_time)

                    else:
                        print(f"Error: {response.status_code} - {response.text}")
                        break

                except Exception as e:
                    print(f"Exception occurred: {e}")
                    break

        return all_projects

    def extract_project_keywords(self, project):
        """
        Extract meaningful keywords from project name and description.

        Args:
            project (dict): Project information dictionary

        Returns:
            list: List of keywords
        """
        # Combine name and description
        text = f"{project['name']} {project['description']}"

        # Tokenize and clean
        tokens = self.word_tokenize(text.lower())

        # Remove stop words and short/irrelevant terms
        keywords = [
            word for word in tokens
            if word.isalpha() and
            word not in self.stop_words and
            len(word) > 2 and
            word not in ["hackathon", "project", "app", "application"]
        ]

        # Add explicit topics and language
        if project["language"] and project["language"].lower() not in keywords:
            keywords.append(project["language"].lower())

        keywords.extend([topic.lower() for topic in project["topics"]])

        return keywords

    def analyze_without_nltk(self, projects):
        """
        Analyze projects without relying on NLTK (fallback method).

        Args:
            projects (list): List of project dictionaries

        Returns:
            dict: Analysis results
        """
        results = {}

        # Analyze languages
        languages = [p["language"] for p in projects if p["language"]]
        results["top_languages"] = Counter(languages).most_common(10)

        # Analyze topics/tags
        all_topics = []
        for project in projects:
            all_topics.extend(project["topics"])
        results["top_topics"] = Counter(all_topics).most_common(15)

        # Simple keyword extraction without NLTK
        common_words = set([
            "and", "the", "a", "an", "in", "on", "at", "to", "for", "of", "with",
            "by", "from", "about", "as", "that", "this", "is", "are", "was", "were",
            "be", "been", "being", "have", "has", "had", "do", "does", "did", "will",
            "would", "should", "could", "can", "may", "might", "must", "project", "app",
            "hackathon", "code", "application", "demo", "create", "using", "use", "used"
        ])

        all_keywords = []
        for project in projects:
            # Simple word extraction without NLTK
            words = re.findall(r'\b[a-zA-Z]{3,}\b', f"{project['name']} {project['description']}")
            keywords = [word.lower() for word in words if word.lower() not in common_words]
            all_keywords.extend(keywords)
            all_keywords.extend([topic.lower() for topic in project["topics"]])
            if project["language"]:
                all_keywords.append(project["language"].lower())

        results["top_keywords"] = Counter(all_keywords).most_common(20)

        return results

    def export_simplified_results(self, projects, trends, filename_prefix="hackathon_projects"):
        """
        Export simplified analysis results to CSV files.

        Args:
            projects (list): List of project dictionaries
            trends (dict): Trend analysis results
            filename_prefix (str): Prefix for output filenames
        """
        # Export projects
        df_projects = pd.DataFrame(projects)
        if "topics" in df_projects.columns:
            df_projects["topics"] = df_projects["topics"].apply(lambda x: ", ".join(x) if x else "")
        df_projects.to_csv(f"{filename_prefix}_data.csv", index=False)

        # Group projects by language
        language_groups = {}
        for project in projects:
            lang = project.get("language") or "Unknown"
            if lang not in language_groups:
                language_groups[lang] = []
            language_groups[lang].append(project)

        language_data = []
        for lang, projs in language_groups.items():
            language_data.append({
                "language": lang,
                "count": len(projs),
                "average_stars": sum(p["stars"] for p in projs) / len(projs),
                "examples": ", ".join([p["full_name"] for p in sorted(projs, key=lambda x: x["stars"], reverse=True)[:3]])
            })

        df_languages = pd.DataFrame(language_data)
        df_languages.to_csv(f"{filename_prefix}_by_language.csv", index=False)

        # Group projects by topic
        topic_data = []
        for topic, count in trends["top_topics"]:
            topic_projects = [p for p in projects if topic in p["topics"]]
            topic_data.append({
                "topic": topic,
                "count": count,
                "examples": ", ".join([p["full_name"] for p in sorted(topic_projects, key=lambda x: x["stars"], reverse=True)[:3]])
            })

        df_topics = pd.DataFrame(topic_data)
        df_topics.to_csv(f"{filename_prefix}_by_topic.csv", index=False)

        # Export keyword trends
        keyword_data = []
        for keyword, count in trends["top_keywords"]:
            keyword_data.append({
                "keyword": keyword,
                "count": count
            })

        df_keywords = pd.DataFrame(keyword_data)
        df_keywords.to_csv(f"{filename_prefix}_keywords.csv", index=False)

        print(f"Data exported to {filename_prefix}_*.csv files")

def main():
    # You can get a GitHub token by following instructions at:
    # https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token
    token = os.environ.get("GITHUB_TOKEN")  # Or replace with your token

    analyzer = HackathonProjectAnalyzer(token=token)

    print("Searching for hackathon projects...")
    projects = analyzer.search_hackathon_projects(limit=300)

    if projects:
        print(f"Found {len(projects)} hackathon projects")

        try:
            # Try to use NLTK features
            print("Analyzing project keywords...")
            all_keywords = []
            for project in projects:
                keywords = analyzer.extract_project_keywords(project)
                all_keywords.extend(keywords)

            keyword_counts = Counter(all_keywords).most_common(20)
            print("\nTop project keywords:")
            for keyword, count in keyword_counts:
                print(f"{keyword}: {count} occurrences")

            # Try to cluster if sklearn is available
            try:
                from sklearn.feature_extraction.text import TfidfVectorizer
                from sklearn.cluster import KMeans

                # Create documents from project descriptions
                documents = []
                for project in projects:
                    doc = f"{project['name']} {project['description']} {' '.join(project['topics'])}"
                    documents.append(doc)

                # Use TF-IDF to vectorize documents
                vectorizer = TfidfVectorizer(
                    max_features=1000,
                    stop_words='english',
                    ngram_range=(1, 2)
                )

                X = vectorizer.fit_transform(documents)

                # Perform KMeans clustering
                kmeans = KMeans(n_clusters=10, random_state=42)
                kmeans.fit(X)

                # Assign clusters to projects
                for i, project in enumerate(projects):
                    project["cluster"] = int(kmeans.labels_[i])

                # Group projects by cluster
                clusters = {}
                for project in projects:
                    cluster_id = project["cluster"]
                    if cluster_id not in clusters:
                        clusters[cluster_id] = []
                    clusters[cluster_id].append(project)

                print("\nProject clusters:")
                for cluster_id, cluster_projects in clusters.items():
                    print(f"Cluster {cluster_id} ({len(cluster_projects)} projects):")
                    for p in sorted(cluster_projects, key=lambda x: x["stars"], reverse=True)[:3]:
                        print(f"  - {p['full_name']}: {p['description'][:100]}...")

            except ImportError:
                print("sklearn not available, skipping clustering")

        except Exception as e:
            print(f"Error using NLTK: {e}")
            print("Falling back to simplified analysis")

        # Always perform basic analysis
        print("\nAnalyzing project trends...")
        trends = analyzer.analyze_without_nltk(projects)

        print("\nTop programming languages:")
        for language, count in trends["top_languages"]:
            print(f"{language}: {count} projects")

        print("\nMost common hackathon topics/tags:")
        for topic, count in trends["top_topics"]:
            print(f"{topic}: {count} occurrences")

        # Export data
        analyzer.export_simplified_results(projects, trends)

if __name__ == "__main__":
    main()

Downloading required NLTK resources...
Searching for hackathon projects...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Rate limit approaching. Sleeping for 51.53 seconds
Found 300 hackathon projects
Analyzing project keywords...
Error using NLTK: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************

Falling back to simplified analysis

Analyzing project trends...

Top programming languages:
JavaScript: 57 projects
Python: 42 projects
TypeScript: 34 projects
HTML: 23 projects
Jupyter Notebook: 22 proj

In [10]:
# Add this to the end of your main() function
print("\nPrinting contents of the CSV files:")

# Print projects data
print("\n=== PROJECTS DATA ===")
projects_df = pd.read_csv("hackathon_projects_data.csv")
print(projects_df.head())  # Print first 5 rows

# Print language data
print("\n=== LANGUAGE DATA ===")
lang_df = pd.read_csv("hackathon_projects_by_language.csv")
print(lang_df)  # Print all rows (usually not too many)

# Print topic data
print("\n=== TOPIC DATA ===")
topic_df = pd.read_csv("hackathon_projects_by_topic.csv")
print(topic_df)  # Print all rows


Printing contents of the CSV files:

=== PROJECTS DATA ===
                           name                              full_name  \
0           graphql-starter-kit           kriasoft/graphql-starter-kit   
1    awesome-hackathon-projects  Olanetsoft/awesome-hackathon-projects   
2                          Otto                       KartikChugh/Otto   
3   mlh-hackathon-flask-starter        MLH/mlh-hackathon-flask-starter   
4  mlh-hackathon-nodejs-starter       MLH/mlh-hackathon-nodejs-starter   

                                         description  \
0  💥  Monorepo template (seed project) pre-config...   
1  This is a curated list of amazing hackathon pr...   
2  Otto makes machine learning an intuitive, natu...   
3   Hackathon starter project for Flask applications   
4  Hackathon starter project for Node.js applicat...   

                                                 url  stars    language  \
0    https://github.com/kriasoft/graphql-starter-kit   3919  TypeScript   
1  https